Patch WebPicDownloading tool -> multithread integration

2022-09-04 22:29:30 +02:00 · 2022-09-04 22:29:30 +02:00 · 63e67772a3
commit 63e67772a3
parent 27b05b6184
2 changed files with 140 additions and 69 deletions
--- a/main.py
+++ b/main.py
@@ -30,7 +30,7 @@ if __name__ == '__main__':
    config = get_config()

    # Create utli/model
-    webpic = WebPicDownloader()
+    webpic = WebPicDownloader(path=config.get('app_folder'), asynchrone=True)

    # Create app controllers
    main_controller = MainController(config)
--- a/model/WebPicDownloader.py
+++ b/model/WebPicDownloader.py
@@ -1,44 +1,69 @@
 import os
-import sys
+import threading
 from urllib import request
 from urllib.error import HTTPError, URLError
 from bs4 import BeautifulSoup, Tag, ResultSet


-class WebPicDownloader():
+class WebPicDownloader(threading.Thread):
    """
    WebPicDownloader

-    webpicdownloader is a simple tool able to find and download all pictures on a webpage.
-    This tool is customizable and allows to define the working folder, the headers present
-    in the http requests which will be emitted, a call back function named messenger which
-    will be called at each event, and in addition the script is thread safe and allows to
-    be stopped in the middle of treatment with a simple call to the stop function.
+    Webpicdownloader is a scraping tool that allows you to browse a web page,
+    find the images and download them. This tool is easily usable and implementable
+    in an application. It has been designed to be executed in an integrated thread
+    in an asynchronous way as well as more classically in a synchronous way. This
+    tool allows to define 3 callback functions, one for events, one in case of
+    success and one in case of failure. It also has an integrated entry point
+    allowing it to be directly executed in terminal mode.

    @author     EndMove <contact@endmove.eu>
-    @version    1.1.1
+    @version    1.2.0
    """
    # Variables
    path: str = None        # Main working folder directory
    headers: dict = None    # Header parameters
-    messenger = None        # Event callback function
-    thread_run: bool = None # Idicate if the task can still run (for use in a thread)
+    dl_setting: dict = None # Parameter dictionary
+    
+    callbacks: dict = None  # Callback dictionary
+    
+    use_thread: bool = None # Indicates if the script must launch the download in a separate thread
+    async_run: bool = None  # Indicates whether the task may keep running (checked between images when run in a thread)

    # Constructor
-    def __init__(self, path: str = None, headers: dict = None, messenger = None) -> None:
+    def __init__(self, path: str = None, headers: dict = None, asynchrone: bool = False,
+                    messenger = None, success = None, failure = None) -> None:
        """
            Constructor
-        => TODO
+        => It is important to initialize the WebPicDownloader object properly. The callback
+        functions can be initialized after the creation of the object.

-        * :path: -> Folder where the tool will download the images.
-        * :headers: -> Dictionary allowing to define the different parameters present in the header of the requests sent by WebPic.
+        * :path: -> Folder in which the tool will create the download folders and place the images.
+        * :headers: -> Dictionary allowing to define the different parameters present in the header
+            of the requests sent by WebPic.
+        * :asynchrone: -> True: launch the download in a separate thread, False: run it synchronously in the calling thread.
+        * :messenger: -> Callback function messenger (see setter).
+        * :success: -> Callback function success (see setter).
+        * :failure: -> Callback function failure (see setter).
        """
+        super().__init__()
        self.path = path if path else os.getcwd()
        self.headers = headers if headers else {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        }
-        self.messenger = messenger if messenger else lambda msg: print(msg)
-        self.thread_run = True
+        self.dl_setting = {
+            'url': None,
+            'name': None
+        }
+
+        self.callbacks = {
+            'messenger': messenger if messenger else lambda msg: print(msg),
+            'success': success if success else lambda: print("Success!"),
+            'failure': failure if failure else lambda: print("failure!")
+        }
+        
+        self.use_thread = asynchrone
+        self.async_run = True

    # Internal functions
    def __get_html(self, url: str) -> str:
@@ -125,71 +150,117 @@ class WebPicDownloader():
        else:
            raise ValueError("The folder already exists, it may already contain images")

+    def __msg(self, message: str) -> None:
+        """
+            Internal Function #do-not-use#
+        => Use the messenger callback to send a message.
+        """
+        self.callbacks.get('messenger')(message)
+
    # Public functions
+    def start_downloading(self, url: str, folder_name: str) -> None:
+        """
+        Start downloading all pictures of a website.
+        Depending on the configuration the download will either be started in the
+        main process or in a separate thread.
+        
+        * :url: -> The url of the website to analyse.
+        * :folder_name: -> The name of the folder into which the images are downloaded.
+        """
+        # Updating settings
+        self.dl_setting['url'] = url
+        self.dl_setting['name'] = folder_name
+
+        # Start processing
+        if self.use_thread:
+            self.async_run = True
+            self.start()
+        else:
+            self.run()
+
+    def stop_downloading(self) -> None:
+        """
+        Stops the download after the current item is processed and exits the downloading thread.
+        
+        <!> This function is only available if the `asynchrone` option was set to True in the constructor; otherwise an Exception is raised. <!>
+        """
+        if self.use_thread:
+            self.async_run = False
+        else:
+            raise Exception("Can not stop a no multi thread execution")
+
+    def run(self) -> None:
+        """
+        <!>
+        This function must not be used! It is an internal function called automatically
+        when the download task starts.
+        <!>
+        """
+        try:
+            # init variables
+            download_count = 0  # count to 0
+            download_folder = f"{self.path}/{self.dl_setting.get('name')}/"  # format path
+            website_html = self.__get_html(self.dl_setting.get('url'))  # website html
+            images = self.__find_all_img(website_html)  # find all img tags in the html
+
+            self.__initialize_folder(download_folder)  # initialize download folder
+            self.__msg(f"WebPicDownloader found {len(images)} images on the website.")
+
+            # process pictures
+            for i, img in enumerate(images):
+                if self.use_thread and not self.async_run:
+                    return
+                try:
+                    self.__msg(f"Start downloading image {i}.")
+                    img_link = self.__find_img_link(img)
+                    self.__download_img(img_link, f"{download_folder}image-{i}.{self.__find_image_type(img_link)}")
+                    self.__msg(f"Download of image {i}, done!")
+                    download_count += 1
+                except Exception as err:
+                    self.__msg(f"ERROR: Unable to process image {i} -> err[{err}].")
+            self.__msg(f"WebPicDownloader has processed {download_count} images out of {len(images)}.")
+            self.callbacks.get('success')()  # success, launch callback and return
+            return
+
+        except HTTPError as err:
+            self.__msg(f"ERROR: An http error occured -> err[{err}].")
+        except (ValueError, URLError) as err:
+            self.__msg(f"ERROT: An error occured with the url -> err[{err}].")
+        except Exception as err:
+            self.__msg(f"ERROR: An unknown error occured -> err[{err}]")
+        self.callbacks.get('failure')()  # failure, launch callback and exit
+
+    def set_success_callback(self, callback) -> None:
+        """
+        Setter to define the callback function when the download succeeded.
+
+        * :callback: -> the callback function to call when the download is a success.
+        """
+        self.callbacks['success'] = callback
+
+    def set_failure_callback(self, callback) -> None:
+        """
+        Setter to define the callback function called when the download fails.
+
+        * :callback: -> the callback function to call when the download is a failure.
+        """
+        self.callbacks['failure'] = callback
+
    def set_messenger_callback(self, callback) -> None:
        """
        Setter to define the callback function called when new messages arrive.

        * :callback: -> the callback function to call when a message event is emited.
        """
-        self.messenger = callback
-    
-    def stop(self) -> None:
-        """
-        Stop the downloading and processing of images (method for use in a thread).
-        """
-        self.thread_run = False
-        sys.exit(0)
-
-    def download(self, url: str, folder_name: str) -> bool:
-        """
-        Start downloading all pictures of a website
-        
-        * :url: -> The url of the website to annalyse.
-        * :folder_name: -> The name of the folder in which to upload the photos.
-        * RETURN -> True if success, False else.
-        """
-        try:
-            count = 0  # count to 0
-            folder_path = f"{self.path}/{folder_name}/"  # format path
-            html = self.__get_html(url)  # website html
-            images = self.__find_all_img(html)  # find all img balises ing html
-
-            self.thread_run = True  # set thread_run to true
-            self.__initialize_folder(folder_path)  # initialize formatted path
-            self.messenger(f"WebPicDownloader found {len(images)} images on the website.")
-
-            for i, img in enumerate(images):
-                print("check")
-                print(id(self))
-                if not self.thread_run:
-                    print("return")
-                try:
-                    self.messenger(f"Start downloading image {i}.")
-                    img_link = self.__find_img_link(img)
-                    self.__download_img(img_link, f"{folder_path}image-{i}.{self.__find_image_type(img_link)}")
-                    self.messenger(f"Download of image {i}, done!")
-                    count += 1
-                except Exception as err:
-                    self.messenger(f"ERROR: Unable to process image {i} -> err[{err}].")
-                    
-            self.messenger(f"WebPicDownloader has processed {count} images out of {len(images)}.")
-            return True
-        except HTTPError as err:
-            self.messenger(f"ERROR: An http error occured -> err[{err}].")
-        except (ValueError, URLError) as err:
-            self.messenger(f"ERROT: An error occured with the url -> err[{err}].")
-        except Exception as err:
-            self.messenger(f"ERROR: An unknown error occured -> err[{err}]")
-        return False
+        self.callbacks['messenger'] = callback

 if __name__ == "__main__":
+    # Internal entry point for testing and console use.
    wpd = WebPicDownloader()
-    wpd.set_messenger_callback(lambda msg: print(f"--> {msg}"))
    while True:
        url = input("Website URL ? ")
        name = input("Folder name ? ")
-        wpd.download(url, name)
+        wpd.start_downloading(url, name)
        if "n" == input("Do you want to continue [Y/n] ? ").lower():
            break
    print("Good bye !")