From 63e67772a33e2690e1297ad113e687894113d9cc Mon Sep 17 00:00:00 2001 From: EndMove Date: Sun, 4 Sep 2022 22:29:30 +0200 Subject: [PATCH] Patch WebPicDownloading tool -> multithread integration --- main.py | 2 +- model/WebPicDownloader.py | 207 +++++++++++++++++++++++++------------- 2 files changed, 140 insertions(+), 69 deletions(-) diff --git a/main.py b/main.py index 7e3e4bf..6043a7d 100644 --- a/main.py +++ b/main.py @@ -30,7 +30,7 @@ if __name__ == '__main__': config = get_config() # Create utli/model - webpic = WebPicDownloader() + webpic = WebPicDownloader(path=config.get('app_folder'), asynchrone=True) # Create app controllers main_controller = MainController(config) diff --git a/model/WebPicDownloader.py b/model/WebPicDownloader.py index 5da7272..62e91c2 100644 --- a/model/WebPicDownloader.py +++ b/model/WebPicDownloader.py @@ -1,44 +1,69 @@ import os -import sys +import threading from urllib import request from urllib.error import HTTPError, URLError from bs4 import BeautifulSoup, Tag, ResultSet -class WebPicDownloader(): +class WebPicDownloader(threading.Thread): """ WebPicDownloader - webpicdownloader is a simple tool able to find and download all pictures on a webpage. - This tool is customizable and allows to define the working folder, the headers present - in the http requests which will be emitted, a call back function named messenger which - will be called at each event, and in addition the script is thread safe and allows to - be stopped in the middle of treatment with a simple call to the stop function. + Webpicdownloader is a scraping tool that allows you to browse a web page, + find the images and download them. This tool is easily usable and implementable + in an application. It has been designed to be executed in an integrated thread + in an asynchronous way as well as more classically in a synchronous way. This + tool allows to define 3 callback functions, one for events, one in case of + success and one in case of failure. 
It also has an integrated entry point + allowing it to be directly executed in terminal mode. @author EndMove - @version 1.1.1 + @version 1.2.0 """ # Variables path: str = None # Main working folder directory headers: dict = None # Header parameters - messenger = None # Event callback function - thread_run: bool = None # Idicate if the task can still run (for use in a thread) + dl_setting: dict = None # Parameter dictionary + + callbacks: dict = None # Callback dictionary + + use_thread: bool = None # Indicates if the script must launch the download in a separate thread + async_run: bool = None # Indicates if the task can still run (for use in a thread) # Constructor - def __init__(self, path: str = None, headers: dict = None, messenger = None) -> None: + def __init__(self, path: str = None, headers: dict = None, asynchrone: bool = False, + messenger = None, success = None, failure = None) -> None: """ Constructor - => TODO + => It is important to initialize the WebPicDownloader object properly. The callback + functions can be initialized after the creation of the object. - * :path: -> Folder where the tool will download the images. - * :headers: -> Dictionary allowing to define the different parameters present in the header of the requests sent by WebPic. + * :path: -> Folder in which the tool will create the download folders and place the images. + * :headers: -> Dictionary allowing to define the different parameters present in the header + of the requests sent by WebPic. + * :asynchrone: -> True: launch the download in a thread, False: the opposite. + * :messenger: -> Callback function messenger (see setter). + * :success: -> Callback function success (see setter). + * :failure: -> Callback function failure (see setter). 
""" + super().__init__() self.path = path if path else os.getcwd() self.headers = headers if headers else { 'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" } - self.messenger = messenger if messenger else lambda msg: print(msg) - self.thread_run = True + self.dl_setting = { + 'url': None, + 'name': None + } + + self.callbacks = { + 'messenger': messenger if messenger else lambda msg: print(msg), + 'success': success if success else lambda: print("Success!"), + 'failure': failure if failure else lambda: print("failure!") + } + + self.use_thread = asynchrone + self.async_run = True # Internal functions def __get_html(self, url: str) -> str: @@ -125,71 +150,117 @@ class WebPicDownloader(): else: raise ValueError("The folder already exists, it may already contain images") + def __msg(self, message: str) -> None: + """ + Internal Function #do-not-use# + => Use the messenger callback to send a message. + """ + self.callbacks.get('messenger')(message) + # Public functions + def start_downloading(self, url: str, folder_name: str) -> None: + """ + Start downloading all pictures of a website. + Depending on the configuration the download will either be started in the + main process or in a separate thread. + + * :url: -> The url of the website to annalyse. + * :folder_name: -> The name of the folder in which to upload the photos. + """ + # Updating settings + self.dl_setting['url'] = url + self.dl_setting['name'] = folder_name + + # Start processing + if self.use_thread: + self.async_run = True + self.start() + else: + self.run() + + def stop_downloading(self) -> None: + """ + Stops the download after the current item is processed and exit the downloading thread. + + this function is available only if asynchronous option is set to True in constructor. 
+ """ + if self.use_thread: + self.async_run = False + else: + raise Exception("Can not stop a no multi thread execution") + + def run(self) -> None: + """ + + This function must not be used! It is an internal function called automatically + when the download task starts. + + """ + try: + # init variables + download_count = 0 # count to 0 + download_folder = f"{self.path}/{self.dl_setting.get('name')}/" # format path + website_html = self.__get_html(self.dl_setting.get('url')) # website html + images = self.__find_all_img(website_html) # find all img balises ing html + + self.__initialize_folder(download_folder) # initialize download folder + self.__msg(f"WebPicDownloader found {len(images)} images on the website.") + + # process pictures + for i, img in enumerate(images): + if self.use_thread and not self.async_run: + return + try: + self.__msg(f"Start downloading image {i}.") + img_link = self.__find_img_link(img) + self.__download_img(img_link, f"{download_folder}image-{i}.{self.__find_image_type(img_link)}") + self.__msg(f"Download of image {i}, done!") + download_count += 1 + except Exception as err: + self.__msg(f"ERROR: Unable to process image {i} -> err[{err}].") + self.__msg(f"WebPicDownloader has processed {download_count} images out of {len(images)}.") + self.callbacks.get('success')() # success, launch callback and return + return + + except HTTPError as err: + self.__msg(f"ERROR: An http error occured -> err[{err}].") + except (ValueError, URLError) as err: + self.__msg(f"ERROT: An error occured with the url -> err[{err}].") + except Exception as err: + self.__msg(f"ERROR: An unknown error occured -> err[{err}]") + self.callbacks.get('failure')() # failure, launch callback and exit + + def set_success_callback(self, callback) -> None: + """ + Setter to define the callback function when the download succeeded. + + * :callback: -> the callback function to call when the download is a success. 
+ """ + self.callbacks['success'] = callback + + def set_failure_callback(self, callback) -> None: + """ + Setter to define the callback function called when the download fails. + + * :callback: -> the callback function to call when the download is a failure. + """ + self.callbacks['failure'] = callback + def set_messenger_callback(self, callback) -> None: """ Setter to define the callback function called when new messages arrive. * :callback: -> the callback function to call when a message event is emited. """ - self.messenger = callback - - def stop(self) -> None: - """ - Stop the downloading and processing of images (method for use in a thread). - """ - self.thread_run = False - sys.exit(0) - - def download(self, url: str, folder_name: str) -> bool: - """ - Start downloading all pictures of a website - - * :url: -> The url of the website to annalyse. - * :folder_name: -> The name of the folder in which to upload the photos. - * RETURN -> True if success, False else. - """ - try: - count = 0 # count to 0 - folder_path = f"{self.path}/{folder_name}/" # format path - html = self.__get_html(url) # website html - images = self.__find_all_img(html) # find all img balises ing html - - self.thread_run = True # set thread_run to true - self.__initialize_folder(folder_path) # initialize formatted path - self.messenger(f"WebPicDownloader found {len(images)} images on the website.") - - for i, img in enumerate(images): - print("check") - print(id(self)) - if not self.thread_run: - print("return") - try: - self.messenger(f"Start downloading image {i}.") - img_link = self.__find_img_link(img) - self.__download_img(img_link, f"{folder_path}image-{i}.{self.__find_image_type(img_link)}") - self.messenger(f"Download of image {i}, done!") - count += 1 - except Exception as err: - self.messenger(f"ERROR: Unable to process image {i} -> err[{err}].") - - self.messenger(f"WebPicDownloader has processed {count} images out of {len(images)}.") - return True - except HTTPError as err: - 
self.messenger(f"ERROR: An http error occured -> err[{err}].") - except (ValueError, URLError) as err: - self.messenger(f"ERROT: An error occured with the url -> err[{err}].") - except Exception as err: - self.messenger(f"ERROR: An unknown error occured -> err[{err}]") - return False + self.callbacks['messenger'] = callback if __name__ == "__main__": + # Internal entry point for testing and console use. wpd = WebPicDownloader() - wpd.set_messenger_callback(lambda msg: print(f"--> {msg}")) while True: url = input("Website URL ? ") name = input("Folder name ? ") - wpd.download(url, name) + wpd.start_downloading(url, name) if "n" == input("Do you want to continue [Y/n] ? ").lower(): break print("Good bye !") \ No newline at end of file