Patch WebPicDownloading tool -> multithread integration

This commit is contained in:
Jérémi N ‘EndMove’ 2022-09-04 22:29:30 +02:00
parent 27b05b6184
commit 63e67772a3
Signed by: EndMove
GPG Key ID: 65C4A02E1F5371A4
2 changed files with 140 additions and 69 deletions

View File

@ -30,7 +30,7 @@ if __name__ == '__main__':
config = get_config()
# Create util/model
webpic = WebPicDownloader()
webpic = WebPicDownloader(path=config.get('app_folder'), asynchrone=True)
# Create app controllers
main_controller = MainController(config)

View File

@ -1,44 +1,69 @@
import os
import sys
import threading
from urllib import request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup, Tag, ResultSet
class WebPicDownloader():
class WebPicDownloader(threading.Thread):
"""
WebPicDownloader
webpicdownloader is a simple tool able to find and download all pictures on a webpage.
This tool is customizable and allows to define the working folder, the headers present
in the http requests which will be emitted, a call back function named messenger which
will be called at each event, and in addition the script is thread safe and allows to
be stopped in the middle of treatment with a simple call to the stop function.
Webpicdownloader is a scraping tool that allows you to browse a web page,
find the images and download them. This tool is easily usable and implementable
in an application. It has been designed to be executed in an integrated thread
in an asynchronous way as well as more classically in a synchronous way. This
tool allows to define 3 callback functions, one for events, one in case of
success and one in case of failure. It also has an integrated entry point
allowing it to be directly executed in terminal mode.
@author EndMove <contact@endmove.eu>
@version 1.1.1
@version 1.2.0
"""
# Variables
path: str = None # Main working folder directory
headers: dict = None # Header parameters
messenger = None # Event callback function
thread_run: bool = None # Idicate if the task can still run (for use in a thread)
dl_setting: dict = None # Parameter dictionary
callbacks: dict = None # Callback dictionary
use_thread: bool = None # Indicates if the script must launch the download in a separate thread
async_run: bool = None # Idicate if the task can still run (for use in a thread)
# Constructor
def __init__(self, path: str = None, headers: dict = None, messenger = None) -> None:
def __init__(self, path: str = None, headers: dict = None, asynchrone: bool = False,
messenger = None, success = None, failure = None) -> None:
"""
Constructor
=> TODO
=> It is important to initialize the WebPicDownloader object properly. The callback
functions can be initialized after the creation of the object.
* :path: -> Folder where the tool will download the images.
* :headers: -> Dictionary allowing to define the different parameters present in the header of the requests sent by WebPic.
* :path: -> Folder in which the tool will create the download folders and place the images.
* :headers: -> Dictionary allowing to define the different parameters present in the header
of the requests sent by WebPic.
* :asynchrone: -> True: launch the download in a separate thread, False: run it synchronously in the calling thread.
* :messenger: -> Callback function messenger (see setter).
* :success: -> Callback function success (see setter).
* :failure: -> Callback function failure (see setter).
"""
super().__init__()
self.path = path if path else os.getcwd()
self.headers = headers if headers else {
'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
}
self.messenger = messenger if messenger else lambda msg: print(msg)
self.thread_run = True
self.dl_setting = {
'url': None,
'name': None
}
self.callbacks = {
'messenger': messenger if messenger else lambda msg: print(msg),
'success': success if success else lambda: print("Success!"),
'failure': failure if failure else lambda: print("failure!")
}
self.use_thread = asynchrone
self.async_run = True
# Internal functions
def __get_html(self, url: str) -> str:
@ -125,71 +150,117 @@ class WebPicDownloader():
else:
raise ValueError("The folder already exists, it may already contain images")
def __msg(self, message: str) -> None:
    """
    Internal Function #do-not-use#
    => Forward a message to the callback registered under 'messenger'.

    * :message: -> The text handed over to the messenger callback.
    """
    notify = self.callbacks.get('messenger')
    notify(message)
# Public functions
def start_downloading(self, url: str, folder_name: str) -> None:
    """
    Start downloading all pictures of a website.
    Depending on the configuration the download will either be started in the
    calling thread (blocking) or in a separate thread (non-blocking).

    A thread object can only be started once, so the first asynchronous
    download runs on this object's own thread and every later one runs on a
    fresh worker thread that targets the same ``run`` method.

    * :url: -> The url of the website to analyse.
    * :folder_name: -> The name of the folder in which to store the pictures.
    * RAISES -> RuntimeError if an asynchronous download is still running.
    """
    if self.use_thread:
        # Refuse overlapping downloads *before* touching the shared settings,
        # otherwise the running task would see its url/name change under it.
        worker = getattr(self, '_download_worker', None)
        if self.is_alive() or (worker is not None and worker.is_alive()):
            raise RuntimeError("A download is already in progress")

    # Updating settings
    self.dl_setting['url'] = url
    self.dl_setting['name'] = folder_name

    # Synchronous mode: process in the calling thread
    if not self.use_thread:
        self.run()
        return

    # Asynchronous mode
    self.async_run = True
    if self.ident is None:
        # Never started yet: use this object's own thread.
        self.start()
    else:
        # Already started once: calling start() again would raise
        # RuntimeError, so delegate to a new worker thread instead.
        self._download_worker = threading.Thread(target=self.run, daemon=self.daemon)
        self._download_worker.start()
def stop_downloading(self) -> None:
    """
    Ask the running download to stop.
    The request is honoured cooperatively: it only clears the flag that the
    download loop checks before handling each image.
    <!> Only available when the asynchronous option was enabled in the constructor. <!>

    * RAISES -> Exception when the downloader is configured for synchronous use.
    """
    if not self.use_thread:
        raise Exception("Can not stop a no multi thread execution")
    self.async_run = False
def run(self) -> None:
    """
    <!>
    This function must not be used! It is an internal function called automatically
    when the download task starts.
    <!>

    Reads the target url and folder name from ``self.dl_setting``, fetches the
    page, locates its images and downloads them one by one into
    ``<path>/<name>/``. A failure on a single image is reported through the
    messenger and does not abort the remaining images.

    The 'success' callback is invoked once every image has been processed;
    the 'failure' callback is invoked when the page cannot be fetched, the
    folder cannot be prepared, or any other error occurs outside the per-image
    processing.
    """
    try:
        # init variables
        download_count = 0                                                  # images successfully downloaded
        download_folder = f"{self.path}/{self.dl_setting.get('name')}/"    # format path
        website_html = self.__get_html(self.dl_setting.get('url'))         # website html
        images = self.__find_all_img(website_html)                         # find all img tags in the html
        self.__initialize_folder(download_folder)                          # initialize download folder
        self.__msg(f"WebPicDownloader found {len(images)} images on the website.")

        # process pictures
        for i, img in enumerate(images):
            if self.use_thread and not self.async_run:
                # Stop requested through stop_downloading().
                # NOTE(review): this path returns without calling either the
                # success or the failure callback - confirm that callers do
                # not wait on one of them after requesting a stop.
                return
            try:
                self.__msg(f"Start downloading image {i}.")
                img_link = self.__find_img_link(img)
                self.__download_img(img_link, f"{download_folder}image-{i}.{self.__find_image_type(img_link)}")
                self.__msg(f"Download of image {i}, done!")
                download_count += 1
            except Exception as err:
                # Best effort: report the image that failed and keep going.
                self.__msg(f"ERROR: Unable to process image {i} -> err[{err}].")

        self.__msg(f"WebPicDownloader has processed {download_count} images out of {len(images)}.")
        self.callbacks.get('success')() # success, launch callback and return
        return
    except HTTPError as err:
        # HTTPError is handled before URLError (named in the next clause) so it gets its own message.
        self.__msg(f"ERROR: An http error occured -> err[{err}].")
    except (ValueError, URLError) as err:
        # Fixed tag: was "ERROT", which escaped any filter on the "ERROR:" prefix.
        self.__msg(f"ERROR: An error occured with the url -> err[{err}].")
    except Exception as err:
        self.__msg(f"ERROR: An unknown error occured -> err[{err}]")
    self.callbacks.get('failure')() # failure, launch callback and exit
def set_success_callback(self, callback) -> None:
    """
    Register the function to call once a download has completed successfully.

    * :callback: -> the function stored under the 'success' key; it is invoked without arguments.
    """
    self.callbacks.update(success=callback)
def set_failure_callback(self, callback) -> None:
    """
    Register the function to call when a download fails.

    * :callback: -> the function stored under the 'failure' key; it is invoked without arguments.
    """
    self.callbacks.update(failure=callback)
def set_messenger_callback(self, callback) -> None:
"""
Setter to define the callback function called when new messages arrive.
* :callback: -> the callback function to call when a message event is emited.
"""
self.messenger = callback
def stop(self) -> None:
"""
Stop the downloading and processing of images (method for use in a thread).
"""
self.thread_run = False
sys.exit(0)
def download(self, url: str, folder_name: str) -> bool:
"""
Start downloading all pictures of a website
* :url: -> The url of the website to annalyse.
* :folder_name: -> The name of the folder in which to upload the photos.
* RETURN -> True if success, False else.
"""
try:
count = 0 # count to 0
folder_path = f"{self.path}/{folder_name}/" # format path
html = self.__get_html(url) # website html
images = self.__find_all_img(html) # find all img balises ing html
self.thread_run = True # set thread_run to true
self.__initialize_folder(folder_path) # initialize formatted path
self.messenger(f"WebPicDownloader found {len(images)} images on the website.")
for i, img in enumerate(images):
print("check")
print(id(self))
if not self.thread_run:
print("return")
try:
self.messenger(f"Start downloading image {i}.")
img_link = self.__find_img_link(img)
self.__download_img(img_link, f"{folder_path}image-{i}.{self.__find_image_type(img_link)}")
self.messenger(f"Download of image {i}, done!")
count += 1
except Exception as err:
self.messenger(f"ERROR: Unable to process image {i} -> err[{err}].")
self.messenger(f"WebPicDownloader has processed {count} images out of {len(images)}.")
return True
except HTTPError as err:
self.messenger(f"ERROR: An http error occured -> err[{err}].")
except (ValueError, URLError) as err:
self.messenger(f"ERROT: An error occured with the url -> err[{err}].")
except Exception as err:
self.messenger(f"ERROR: An unknown error occured -> err[{err}]")
return False
self.callbacks['messenger'] = callback
if __name__ == "__main__":
# Internal entry point for testing and console use.
wpd = WebPicDownloader()
wpd.set_messenger_callback(lambda msg: print(f"--> {msg}"))
while True:
url = input("Website URL ? ")
name = input("Folder name ? ")
wpd.download(url, name)
wpd.start_downloading(url, name)
if "n" == input("Do you want to continue [Y/n] ? ").lower():
break
print("Good bye !")