Patch WebPicDownloading tool -> multithread integration
This commit is contained in:
parent
27b05b6184
commit
63e67772a3
2
main.py
2
main.py
@ -30,7 +30,7 @@ if __name__ == '__main__':
|
|||||||
config = get_config()
|
config = get_config()
|
||||||
|
|
||||||
# Create utli/model
|
# Create utli/model
|
||||||
webpic = WebPicDownloader()
|
webpic = WebPicDownloader(path=config.get('app_folder'), asynchrone=True)
|
||||||
|
|
||||||
# Create app controllers
|
# Create app controllers
|
||||||
main_controller = MainController(config)
|
main_controller = MainController(config)
|
||||||
|
@ -1,44 +1,69 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import threading
|
||||||
from urllib import request
|
from urllib import request
|
||||||
from urllib.error import HTTPError, URLError
|
from urllib.error import HTTPError, URLError
|
||||||
from bs4 import BeautifulSoup, Tag, ResultSet
|
from bs4 import BeautifulSoup, Tag, ResultSet
|
||||||
|
|
||||||
|
|
||||||
class WebPicDownloader():
|
class WebPicDownloader(threading.Thread):
|
||||||
"""
|
"""
|
||||||
WebPicDownloader
|
WebPicDownloader
|
||||||
|
|
||||||
webpicdownloader is a simple tool able to find and download all pictures on a webpage.
|
Webpicdownloader is a scraping tool that allows you to browse a web page,
|
||||||
This tool is customizable and allows to define the working folder, the headers present
|
find the images and download them. This tool is easily usable and implementable
|
||||||
in the http requests which will be emitted, a call back function named messenger which
|
in an application. It has been designed to be executed in an integrated thread
|
||||||
will be called at each event, and in addition the script is thread safe and allows to
|
in an asynchronous way as well as more classically in a synchronous way. This
|
||||||
be stopped in the middle of treatment with a simple call to the stop function.
|
tool allows to define 3 callback functions, one for events, one in case of
|
||||||
|
success and one in case of failure. It also has an integrated entry point
|
||||||
|
allowing it to be directly executed in terminal mode.
|
||||||
|
|
||||||
@author EndMove <contact@endmove.eu>
|
@author EndMove <contact@endmove.eu>
|
||||||
@version 1.1.1
|
@version 1.2.0
|
||||||
"""
|
"""
|
||||||
# Variables
|
# Variables
|
||||||
path: str = None # Main working folder directory
|
path: str = None # Main working folder directory
|
||||||
headers: dict = None # Header parameters
|
headers: dict = None # Header parameters
|
||||||
messenger = None # Event callback function
|
dl_setting: dict = None # Parameter dictionary
|
||||||
thread_run: bool = None # Indicates if the task can still run (for use in a thread)
|
|
||||||
|
callbacks: dict = None # Callback dictionary
|
||||||
|
|
||||||
|
use_thread: bool = None # Indicates if the script must launch the download in a separate thread
|
||||||
|
async_run: bool = None # Indicates if the task can still run (for use in a thread)
|
||||||
|
|
||||||
# Constructor
|
# Constructor
|
||||||
def __init__(self, path: str = None, headers: dict = None, messenger = None) -> None:
|
def __init__(self, path: str = None, headers: dict = None, asynchrone: bool = False,
|
||||||
|
messenger = None, success = None, failure = None) -> None:
|
||||||
"""
|
"""
|
||||||
Constructor
|
Constructor
|
||||||
=> TODO
|
=> It is important to initialize the WebPicDownloader object properly. The callback
|
||||||
|
functions can be initialized after the creation of the object.
|
||||||
|
|
||||||
* :path: -> Folder where the tool will download the images.
|
* :path: -> Folder in which the tool will create the download folders and place the images.
|
||||||
* :headers: -> Dictionary allowing to define the different parameters present in the header of the requests sent by WebPic.
|
* :headers: -> Dictionary allowing to define the different parameters present in the header
|
||||||
|
of the requests sent by WebPic.
|
||||||
|
* :asynchronous: -> True: launch the download in a thread, False: the opposite.
|
||||||
|
* :messenger: -> Callback function messenger (see setter).
|
||||||
|
* :success: -> Callback function success (see setter).
|
||||||
|
* :failure: -> Callback function failure (see setter).
|
||||||
"""
|
"""
|
||||||
|
super().__init__()
|
||||||
self.path = path if path else os.getcwd()
|
self.path = path if path else os.getcwd()
|
||||||
self.headers = headers if headers else {
|
self.headers = headers if headers else {
|
||||||
'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
self.messenger = messenger if messenger else lambda msg: print(msg)
|
self.dl_setting = {
|
||||||
self.thread_run = True
|
'url': None,
|
||||||
|
'name': None
|
||||||
|
}
|
||||||
|
|
||||||
|
self.callbacks = {
|
||||||
|
'messenger': messenger if messenger else lambda msg: print(msg),
|
||||||
|
'success': success if success else lambda: print("Success!"),
|
||||||
|
'failure': failure if failure else lambda: print("failure!")
|
||||||
|
}
|
||||||
|
|
||||||
|
self.use_thread = asynchrone
|
||||||
|
self.async_run = True
|
||||||
|
|
||||||
# Internal functions
|
# Internal functions
|
||||||
def __get_html(self, url: str) -> str:
|
def __get_html(self, url: str) -> str:
|
||||||
@ -125,71 +150,117 @@ class WebPicDownloader():
|
|||||||
else:
|
else:
|
||||||
raise ValueError("The folder already exists, it may already contain images")
|
raise ValueError("The folder already exists, it may already contain images")
|
||||||
|
|
||||||
|
def __msg(self, message: str) -> None:
|
||||||
|
"""
|
||||||
|
Internal Function #do-not-use#
|
||||||
|
=> Use the messenger callback to send a message.
|
||||||
|
"""
|
||||||
|
self.callbacks.get('messenger')(message)
|
||||||
|
|
||||||
# Public functions
|
# Public functions
|
||||||
|
def start_downloading(self, url: str, folder_name: str) -> None:
|
||||||
|
"""
|
||||||
|
Start downloading all pictures of a website.
|
||||||
|
Depending on the configuration the download will either be started in the
|
||||||
|
main process or in a separate thread.
|
||||||
|
|
||||||
|
* :url: -> The url of the website to annalyse.
|
||||||
|
* :folder_name: -> The name of the folder in which to upload the photos.
|
||||||
|
"""
|
||||||
|
# Updating settings
|
||||||
|
self.dl_setting['url'] = url
|
||||||
|
self.dl_setting['name'] = folder_name
|
||||||
|
|
||||||
|
# Start processing
|
||||||
|
if self.use_thread:
|
||||||
|
self.async_run = True
|
||||||
|
self.start()
|
||||||
|
else:
|
||||||
|
self.run()
|
||||||
|
|
||||||
|
def stop_downloading(self) -> None:
|
||||||
|
"""
|
||||||
|
Stops the download after the current item is processed and exit the downloading thread.
|
||||||
|
|
||||||
|
<!> this function is available only if asynchronous option is set to True in constructor. <!>
|
||||||
|
"""
|
||||||
|
if self.use_thread:
|
||||||
|
self.async_run = False
|
||||||
|
else:
|
||||||
|
raise Exception("Can not stop a no multi thread execution")
|
||||||
|
|
||||||
|
def run(self) -> None:
    """
    <!>
    This function must not be used! It is an internal function called automatically
    when the download task starts.
    <!>

    Fetch the page stored in dl_setting['url'], look for its images and download
    them one by one into "<path>/<dl_setting['name']>/". A failure on a single
    image is reported through the messenger and does not abort the others.

    The success callback is called when the whole page has been processed, the
    failure callback when the page or the folder could not be handled. When the
    download is stopped through the async_run flag, the function returns without
    calling either callback.
    """
    try:
        # init variables
        download_count = 0 # count to 0
        download_folder = f"{self.path}/{self.dl_setting.get('name')}/" # format path
        website_html = self.__get_html(self.dl_setting.get('url')) # website html
        images = self.__find_all_img(website_html) # find all img tags in html

        self.__initialize_folder(download_folder) # initialize download folder
        self.__msg(f"WebPicDownloader found {len(images)} images on the website.")

        # process pictures
        for i, img in enumerate(images):
            # a stop request is only honoured between two images
            if self.use_thread and not self.async_run:
                return
            try:
                self.__msg(f"Start downloading image {i}.")
                img_link = self.__find_img_link(img)
                self.__download_img(img_link, f"{download_folder}image-{i}.{self.__find_image_type(img_link)}")
                self.__msg(f"Download of image {i}, done!")
                download_count += 1
            except Exception as err:
                # best effort: report the image and carry on with the next one
                self.__msg(f"ERROR: Unable to process image {i} -> err[{err}].")
        self.__msg(f"WebPicDownloader has processed {download_count} images out of {len(images)}.")

    except HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be caught first
        self.__msg(f"ERROR: An http error occured -> err[{err}].")
    except (ValueError, URLError) as err:
        self.__msg(f"ERROR: An error occured with the url -> err[{err}].")
    except Exception as err:
        self.__msg(f"ERROR: An unknown error occured -> err[{err}]")
    else:
        # Called outside the try body so that an exception raised by the callback
        # itself is not reported as a download error followed by the failure callback.
        self.callbacks.get('success')() # success, launch callback and return
        return

    self.callbacks.get('failure')() # failure, launch callback and exit
|
||||||
|
|
||||||
|
def set_success_callback(self, callback) -> None:
|
||||||
|
"""
|
||||||
|
Setter to define the callback function when the download succeeded.
|
||||||
|
|
||||||
|
* :callback: -> the callback function to call when the download is a success.
|
||||||
|
"""
|
||||||
|
self.callbacks['success'] = callback
|
||||||
|
|
||||||
|
def set_failure_callback(self, callback) -> None:
|
||||||
|
"""
|
||||||
|
Setter to define the callback function called when the download fails.
|
||||||
|
|
||||||
|
* :callback: -> the callback function to call when the download is a failure.
|
||||||
|
"""
|
||||||
|
self.callbacks['failure'] = callback
|
||||||
|
|
||||||
def set_messenger_callback(self, callback) -> None:
|
def set_messenger_callback(self, callback) -> None:
|
||||||
"""
|
"""
|
||||||
Setter to define the callback function called when new messages arrive.
|
Setter to define the callback function called when new messages arrive.
|
||||||
|
|
||||||
* :callback: -> the callback function to call when a message event is emited.
|
* :callback: -> the callback function to call when a message event is emited.
|
||||||
"""
|
"""
|
||||||
self.messenger = callback
|
self.callbacks['messenger'] = callback
|
||||||
|
|
||||||
def stop(self) -> None:
|
|
||||||
"""
|
|
||||||
Stop the downloading and processing of images (method for use in a thread).
|
|
||||||
"""
|
|
||||||
self.thread_run = False
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
def download(self, url: str, folder_name: str) -> bool:
    """
    Start downloading all pictures of a website

    The page is fetched, its images are looked up and downloaded one by one into
    "<path>/<folder_name>/". A failure on a single image is reported through the
    messenger and does not abort the others. The thread_run flag is checked
    before each image so that a stop request actually interrupts the loop.

    * :url: -> The url of the website to analyse.
    * :folder_name: -> The name of the folder in which to download the photos.
    * RETURN -> True if the whole page was processed, False if an error occurred
        or if the download was stopped before the end.
    """
    try:
        count = 0 # count to 0
        folder_path = f"{self.path}/{folder_name}/" # format path
        html = self.__get_html(url) # website html
        images = self.__find_all_img(html) # find all img tags in html

        self.thread_run = True # set thread_run to true
        self.__initialize_folder(folder_path) # initialize formatted path
        self.messenger(f"WebPicDownloader found {len(images)} images on the website.")

        stopped = False
        for i, img in enumerate(images):
            if not self.thread_run:
                # stop requested: leave the loop instead of only logging it
                stopped = True
                break
            try:
                self.messenger(f"Start downloading image {i}.")
                img_link = self.__find_img_link(img)
                self.__download_img(img_link, f"{folder_path}image-{i}.{self.__find_image_type(img_link)}")
                self.messenger(f"Download of image {i}, done!")
                count += 1
            except Exception as err:
                # best effort: report the image and carry on with the next one
                self.messenger(f"ERROR: Unable to process image {i} -> err[{err}].")

        self.messenger(f"WebPicDownloader has processed {count} images out of {len(images)}.")
        return not stopped
    except HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be caught first
        self.messenger(f"ERROR: An http error occured -> err[{err}].")
    except (ValueError, URLError) as err:
        self.messenger(f"ERROR: An error occured with the url -> err[{err}].")
    except Exception as err:
        self.messenger(f"ERROR: An unknown error occured -> err[{err}]")
    return False
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Internal entry point for testing and console use: ask for a URL and a
    # folder name, run the download, and repeat until the user answers "n".
    wpd = WebPicDownloader()
    keep_going = True
    while keep_going:
        url = input("Website URL ? ")
        name = input("Folder name ? ")
        wpd.start_downloading(url, name)
        answer = input("Do you want to continue [Y/n] ? ")
        keep_going = answer.lower() != "n"
    print("Good bye !")
|
Reference in New Issue
Block a user