Major bug fixes, optimization + major advance

This commit is contained in:
2022-09-05 23:13:38 +02:00
parent 539b75cb09
commit f8f7832dd7
11 changed files with 322 additions and 161 deletions

View File

@@ -1,10 +1,31 @@
import os
from enum import Enum
from threading import Semaphore, Thread
from urllib import request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup, Tag, ResultSet
class MessageType(Enum):
    """
    MessageType

    Enumeration of the kinds of message sent by the webpic messenger.
    Every member's value is the short string tag of that kind:
    - LOG     -> 'log'
    - ERROR   -> 'err'
    - SUCCESS -> 'suc'

    @author Jérémi Nihart / EndMove
    @link https://git.endmove.eu/EndMove/WebPicDownloader
    @version 1.0.0
    @since 2022-09-05
    """

    LOG = 'log'
    ERROR = 'err'
    SUCCESS = 'suc'
class WebPicDownloader(Thread):
"""
WebPicDownloader
@@ -12,19 +33,18 @@ class WebPicDownloader(Thread):
Webpicdownloader is a scraping tool that allows you to browse a web page,
find the images and download them. This tool is easily usable and implementable
in an application. It has been designed to be executed in an integrated thread
in an asynchronous way as well as more classically in a synchronous way. This
tool allows to define 3 callback functions, one for events, one in case of
success and one in case of failure. It also has an integrated entry point
allowing it to be directly executed in terminal mode.
in an asynchronous way. This tool allows to define 3 callback functions, one for
events, one in case of success and one in case of failure. It also has an
integrated entry point allowing it to be directly executed in terminal mode.
@author EndMove <contact@endmove.eu>
@version 1.2.0
@version 1.2.1
"""
# Variables
__callbacks: dict = None # Callback dictionary
__settings: dict = None #
__dl_infos: dict = None #
__sem: Semaphore = None #
__settings: dict = None # Webpic basics settings
__dl_infos: dict = None # Download informations
__sem: Semaphore = None # Semaphore for the webpic worker
_exit: bool = None # When set to True quit the thread
@@ -39,7 +59,6 @@ class WebPicDownloader(Thread):
* :path: -> Folder in which the tool will create the download folders and place the images.
* :headers: -> Dictionary allowing to define the different parameters present in the header
of the requests sent by WebPic.
* :asynchronous: -> True: launch the download in a thread, False: the opposite.
* :messenger: -> Callback function messenger (see setter).
* :success: -> Callback function success (see setter).
* :failure: -> Callback function failure (see setter).
@@ -60,8 +79,6 @@ class WebPicDownloader(Thread):
'website_url': 'url',
'download_name': 'name',
'download_path': 'full_path',
'tot_image_count': 0,
'dl_image_count': 0,
'running': False
}
self.__sem = Semaphore(0)
@@ -69,6 +86,7 @@ class WebPicDownloader(Thread):
self.start() # start deamon
# Internal functions
def __get_html(self, url: str) -> str:
"""
@@ -82,6 +100,7 @@ class WebPicDownloader(Thread):
response = request.urlopen(req)
return response.read().decode('utf-8')
def __find_all_img(self, html: str) -> ResultSet:
"""
Internal Function #do-not-use#
@@ -93,6 +112,7 @@ class WebPicDownloader(Thread):
soup = BeautifulSoup(html, 'html.parser')
return soup.find_all('img')
def __find_img_link(self, img: Tag) -> str:
"""
Internal Function #do-not-use#
@@ -115,6 +135,7 @@ class WebPicDownloader(Thread):
raise ValueError("Bad image url")
return link
def __find_image_type(self, img_link: str) -> str:
"""
Internal Function #do-not-use#
@@ -128,6 +149,7 @@ class WebPicDownloader(Thread):
type = type.split('?')[0]
return type
def __download_img(self, url: str, filename: str) -> None:
"""
Internal Function #do-not-use#
@@ -141,6 +163,7 @@ class WebPicDownloader(Thread):
with open(filename, 'wb') as img:
img.write(raw_img)
def __initialize_folder(self, folder_path: str) -> None:
"""
Internal Function #do-not-use#
@@ -154,12 +177,17 @@ class WebPicDownloader(Thread):
else:
raise ValueError("The folder already exists, it may already contain images")
def __msg(self, message: str, type: MessageType = MessageType.LOG) -> None:
    """
    Internal Function #do-not-use#
    => Use the messenger callback to send a message.

    Nothing is sent when no messenger callback is registered, so that a
    missing callback cannot raise from inside the worker's error handlers.

    * :message: -> the message to send through callback
    * :type: -> message type, a MessageType member (MessageType.LOG by default)
    """
    # NOTE: `type` shadows the builtin, but it is part of the signature and
    # is therefore kept for callers passing it by keyword.
    messenger = self.__callbacks.get('messenger')
    if messenger is None:
        return
    messenger(message, type)
# Public functions
def set_success_callback(self, callback) -> None:
@@ -170,6 +198,7 @@ class WebPicDownloader(Thread):
"""
self.__callbacks['success'] = callback
def set_failure_callback(self, callback) -> None:
"""
Setter to define the callback function called when the download fails.
@@ -178,6 +207,7 @@ class WebPicDownloader(Thread):
"""
self.__callbacks['failure'] = callback
def set_messenger_callback(self, callback) -> None:
"""
Setter to define the callback function called when new messages arrive.
@@ -186,74 +216,103 @@ class WebPicDownloader(Thread):
"""
self.__callbacks['messenger'] = callback
def start_downloading(self, url: str, name: str) -> None:
    """
    Start downloading all pictures of a website.

    The download itself is not done here: the url and the folder name are
    stored in the download informations and the semaphore is released so
    that the worker thread picks the request up. When the worker thread is
    not alive, or when a download is already running, an error message is
    sent through the messenger instead and nothing is queued.

    * :url: -> The url of the website to analyse.
    * :name: -> The name of the folder in which to store the pictures.
    """
    # is_alive is a method: it has to be called. The bound method object
    # itself is always truthy, which made this check unreachable.
    if not self.is_alive():
        self.__msg("Opss, the download thread is not running, please restart webpic.", MessageType.ERROR)
    elif self.__dl_infos.get('running'):
        self.__msg("Opss, the download thread is busy.", MessageType.ERROR)
    else:
        self.__dl_infos['website_url'] = url
        self.__dl_infos['download_name'] = name
        self.__sem.release()  # wake the worker up
def stop_downloading(self, block=False) -> None:
    """
    Ask the download thread to exit.

    The exit flag is raised and the semaphore is released. The worker reads
    the flag when it next takes the semaphore, so a download that is already
    in progress is finished first.
    <!> Attention once called it will not be possible any more to download. <!>

    * :block: -> If True, wait (join) until the worker thread has finished.
                 If False (default value), return right after the stop
                 request has been sent.
    """
    self.__exit = True
    self.__sem.release()  # wake the worker so that it sees the exit flag
    if not block:
        return
    self.join()
def is_download_running(self) -> bool:
    """
    Tell whether a download is currently in progress.

    * RETURN -> True if yes, False else.
    """
    running = self.__dl_infos['running']
    return running
# Thread corp function
def run(self) -> None:
    """
    Thread body: wait for download requests and process them one by one.

    Each turn of the loop blocks on the semaphore. Once woken up, the thread
    returns if the exit flag is set; otherwise it fetches the page stored in
    the download informations, looks for its images, creates the download
    folder and downloads every image into it. An image that cannot be
    processed is reported through the messenger and skipped. The success
    callback is called at the end of the job, the failure callback when the
    job as a whole raised.

    The 'running' flag is reset in a `finally` clause so that the downloader
    is never left marked as busy, even if a callback or the messenger raises.
    """
    while True:
        self.__sem.acquire()  # waiting the authorization to process
        if self.__exit:  # check if the exiting is requested
            return

        self.__dl_infos['running'] = True  # indicate that the thread is busy
        try:
            # parse infos from url
            html = self.__get_html(self.__dl_infos.get('website_url'))  # website html
            images = self.__find_all_img(html)  # find all img tags in the html

            # setting up download informations
            tot_count = len(images)  # total number of images found
            dl_count = 0  # number of images downloaded so far
            download_path = f"{self.__settings.get('root_path')}/{self.__dl_infos.get('download_name')}/"
            self.__dl_infos['download_path'] = download_path

            # init working directory
            self.__initialize_folder(download_path)
            self.__msg(f"WebPicDownloader found {tot_count} images on the website.")

            # start images processing
            for i, img in enumerate(images):
                try:
                    self.__msg(f"Start downloading image {i}.")
                    img_link = self.__find_img_link(img)  # find image link
                    self.__download_img(img_link, f"{download_path}image-{i}.{self.__find_image_type(img_link)}")
                    self.__msg(f"Download of image {i}, done!")
                    dl_count += 1  # increment download counter
                except Exception as err:
                    # best effort: one bad image must not abort the others
                    self.__msg(f"ERROR: Unable to process image {i} -> err[{err}].", MessageType.ERROR)
            # end images processing

            self.__msg(f"WebPicDownloader has processed {dl_count} images out of {tot_count}.")
            self.__callbacks.get('success')()  # success, launch callback
        except Exception as err:
            self.__msg(f"ERROR: An error occured -> err[{err}]", MessageType.ERROR)
            self.__callbacks.get('failure')()  # error, launch callback
        finally:
            self.__dl_infos['running'] = False  # indicate that the thread is free
if __name__ == "__main__":
    # Internal entry point for testing and console use.
    def _ignore_message(msg, type=None):
        # The messenger is called with (message, type): a one-argument
        # callback would raise a TypeError inside the download thread.
        pass

    wpd = WebPicDownloader()
    wpd.set_messenger_callback(_ignore_message)

    while True:
        url = input("Website URL ? ")
        name = input("Folder name ? ")
        wpd.start_downloading(url, name)
        if "n" == input("Do you want to continue [Y/n] ? ").lower():
            break

    # A single blocking stop: request the exit and wait for the worker.
    wpd.stop_downloading(block=True)
    print("Good bye !")