Major bug fixes, optimization + major advance
This commit is contained in:
@@ -1,10 +1,31 @@
|
||||
import os
|
||||
from enum import Enum
|
||||
from threading import Semaphore, Thread
|
||||
from urllib import request
|
||||
from urllib.error import HTTPError, URLError
|
||||
from bs4 import BeautifulSoup, Tag, ResultSet
|
||||
|
||||
|
||||
class MessageType(Enum):
    """
    MessageType

    Enumeration of the kinds of messages that the webpic messenger can send.
    Each member carries a short string value:

    - LOG     -> 'log' : informational message
    - ERROR   -> 'err' : error message
    - SUCCESS -> 'suc' : success message

    @author Jérémi Nihart / EndMove
    @link https://git.endmove.eu/EndMove/WebPicDownloader
    @version 1.0.0
    @since 2022-09-05
    """
    LOG = 'log'
    ERROR = 'err'
    SUCCESS = 'suc'
|
||||
|
||||
|
||||
class WebPicDownloader(Thread):
|
||||
"""
|
||||
WebPicDownloader
|
||||
@@ -12,19 +33,18 @@ class WebPicDownloader(Thread):
|
||||
Webpicdownloader is a scraping tool that allows you to browse a web page,
|
||||
find the images and download them. This tool is easily usable and implementable
|
||||
in an application. It has been designed to be executed in an integrated thread
|
||||
in an asynchronous way as well as more classically in a synchronous way. This
|
||||
tool allows to define 3 callback functions, one for events, one in case of
|
||||
success and one in case of failure. It also has an integrated entry point
|
||||
allowing it to be directly executed in terminal mode.
|
||||
in an asynchronous way. This tool allows to define 3 callback functions, one for
|
||||
events, one in case of success and one in case of failure. It also has an
|
||||
integrated entry point allowing it to be directly executed in terminal mode.
|
||||
|
||||
@author EndMove <contact@endmove.eu>
|
||||
@version 1.2.0
|
||||
@version 1.2.1
|
||||
"""
|
||||
# Variables
__callbacks: dict = None    # Callback dictionary
__settings: dict = None     # Webpic basics settings
__dl_infos: dict = None     # Download informations
__sem: Semaphore = None     # Semaphore for the webpic worker

# Declared with two leading underscores so that it is the very attribute read and
# written as `self.__exit` by run() and stop_downloading(); a single-underscore
# `_exit` is a different attribute and would never be consulted by those methods.
__exit: bool = None         # When set to True quit the thread
|
||||
|
||||
@@ -39,7 +59,6 @@ class WebPicDownloader(Thread):
|
||||
* :path: -> Folder in which the tool will create the download folders and place the images.
|
||||
* :headers: -> Dictionary allowing to define the different parameters present in the header
|
||||
of the requests sent by WebPic.
|
||||
* :asynchronous: -> True: launch the download in a thread, False: the opposite.
|
||||
* :messenger: -> Callback function messenger (see setter).
|
||||
* :success: -> Callback function success (see setter).
|
||||
* :failure: -> Callback function failure (see setter).
|
||||
@@ -60,8 +79,6 @@ class WebPicDownloader(Thread):
|
||||
'website_url': 'url',
|
||||
'download_name': 'name',
|
||||
'download_path': 'full_path',
|
||||
'tot_image_count': 0,
|
||||
'dl_image_count': 0,
|
||||
'running': False
|
||||
}
|
||||
self.__sem = Semaphore(0)
|
||||
@@ -69,6 +86,7 @@ class WebPicDownloader(Thread):
|
||||
|
||||
self.start() # start deamon
|
||||
|
||||
|
||||
# Internal functions
|
||||
def __get_html(self, url: str) -> str:
|
||||
"""
|
||||
@@ -82,6 +100,7 @@ class WebPicDownloader(Thread):
|
||||
response = request.urlopen(req)
|
||||
return response.read().decode('utf-8')
|
||||
|
||||
|
||||
def __find_all_img(self, html: str) -> ResultSet:
|
||||
"""
|
||||
Internal Function #do-not-use#
|
||||
@@ -93,6 +112,7 @@ class WebPicDownloader(Thread):
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
return soup.find_all('img')
|
||||
|
||||
|
||||
def __find_img_link(self, img: Tag) -> str:
|
||||
"""
|
||||
Internal Function #do-not-use#
|
||||
@@ -115,6 +135,7 @@ class WebPicDownloader(Thread):
|
||||
raise ValueError("Bad image url")
|
||||
return link
|
||||
|
||||
|
||||
def __find_image_type(self, img_link: str) -> str:
|
||||
"""
|
||||
Internal Function #do-not-use#
|
||||
@@ -128,6 +149,7 @@ class WebPicDownloader(Thread):
|
||||
type = type.split('?')[0]
|
||||
return type
|
||||
|
||||
|
||||
def __download_img(self, url: str, filename: str) -> None:
|
||||
"""
|
||||
Internal Function #do-not-use#
|
||||
@@ -141,6 +163,7 @@ class WebPicDownloader(Thread):
|
||||
with open(filename, 'wb') as img:
|
||||
img.write(raw_img)
|
||||
|
||||
|
||||
def __initialize_folder(self, folder_path: str) -> None:
|
||||
"""
|
||||
Internal Function #do-not-use#
|
||||
@@ -154,12 +177,17 @@ class WebPicDownloader(Thread):
|
||||
else:
|
||||
raise ValueError("The folder already exists, it may already contain images")
|
||||
|
||||
def __msg(self, message: str, type: MessageType = MessageType.LOG) -> None:
    """
    Internal Function #do-not-use#

    => Use the messenger callback to send a message.

    The callback registered under the 'messenger' key is invoked exactly once
    with two positional arguments: the message and its type.

    * :message: -> the message to send through callback
    * :type: -> message type, a MessageType member (values 'log', 'err', 'suc').
                The parameter name shadows the builtin `type`; it is kept as is
                so that existing keyword callers keep working.
    """
    self.__callbacks.get('messenger')(message, type)
|
||||
|
||||
|
||||
# Public functions
|
||||
def set_success_callback(self, callback) -> None:
|
||||
@@ -170,6 +198,7 @@ class WebPicDownloader(Thread):
|
||||
"""
|
||||
self.__callbacks['success'] = callback
|
||||
|
||||
|
||||
def set_failure_callback(self, callback) -> None:
|
||||
"""
|
||||
Setter to define the callback function called when the download fails.
|
||||
@@ -178,6 +207,7 @@ class WebPicDownloader(Thread):
|
||||
"""
|
||||
self.__callbacks['failure'] = callback
|
||||
|
||||
|
||||
def set_messenger_callback(self, callback) -> None:
|
||||
"""
|
||||
Setter to define the callback function called when new messages arrive.
|
||||
@@ -186,74 +216,103 @@ class WebPicDownloader(Thread):
|
||||
"""
|
||||
self.__callbacks['messenger'] = callback
|
||||
|
||||
|
||||
def start_downloading(self, url: str, name: str) -> None:
    """
    Start downloading all pictures of a website.

    The request is handed over to the worker thread: the url and the folder name
    are stored in the download informations, then the worker semaphore is
    released. When the worker thread is not alive, or is already busy, nothing
    is scheduled and an error message is sent through the messenger instead.

    * :url: -> The url of the website to analyse.
    * :name: -> The name of the folder in which to download the pictures.
    """
    # is_alive is a method and must be called: the bound method object itself is
    # always truthy, which would make this check unreachable.
    if not self.is_alive():
        self.__msg("Opss, the download thread is not running, please restart webpic.", MessageType.ERROR)
    elif self.__dl_infos.get('running'):
        self.__msg("Opss, the download thread is busy.", MessageType.ERROR)
    else:
        self.__dl_infos['website_url'] = url
        self.__dl_infos['download_name'] = name
        self.__sem.release()  # wake up the worker thread
|
||||
|
||||
|
||||
def stop_downloading(self, block=False) -> None:
    """
    Ask the downloading thread to exit.

    The exit flag is raised and the worker semaphore is released. The worker
    looks at the flag each time it wakes up on the semaphore, so a download
    that is already in progress is not interrupted.

    <!> Attention once called it will not be possible any more to download. <!>

    * :block: -> If True, the function will block until the worker has finished working, if
                 False (default value), the stop request is sent and the program continues.
    """
    self.__exit = True      # request the exit of the worker
    self.__sem.release()    # wake up the worker so that it sees the request
    if not block:
        return
    self.join()             # wait for the worker thread to terminate
|
||||
|
||||
|
||||
def is_download_running(self) -> bool:
    """
    Tell whether a download is currently in progress.

    * RETURN -> True if yes, False else.
    """
    running = self.__dl_infos['running']
    return running
|
||||
|
||||
|
||||
# Thread corp function
|
||||
def run(self) -> None:
    """
    Body of the worker thread.

    Loops forever: waits on the semaphore, returns if the exit flag is set,
    otherwise processes the download described in the download informations
    (website url and download name). Progress is reported through the
    messenger; the 'success' callback is called when the page was processed,
    the 'failure' callback when an error aborted the download. A failure on a
    single image is reported and does not abort the others.
    """
    while True:
        self.__sem.acquire()  # waiting the authorization to process

        if self.__exit:  # check if the exiting is requested
            return

        self.__dl_infos['running'] = True  # indicate that the thread is busy

        try:
            # parse infos from url
            html = self.__get_html(self.__dl_infos.get('website_url'))  # website html
            images = self.__find_all_img(html)  # find all img tags in html

            # setting up download informations
            tot_count = len(images)  # count total image
            dl_count = 0  # set download count to 0
            self.__dl_infos['download_path'] = f"{self.__settings.get('root_path')}/{self.__dl_infos.get('download_name')}/"  # format path

            # init working directory
            self.__initialize_folder(self.__dl_infos.get('download_path'))  # init download folder
            self.__msg(f"WebPicDownloader found {tot_count} images on the website.")

            # start images processing
            for i, img in enumerate(images):
                try:
                    self.__msg(f"Start downloading image {i}.")

                    img_link = self.__find_img_link(img)  # find image link
                    self.__download_img(img_link, f"{self.__dl_infos.get('download_path')}image-{i}.{self.__find_image_type(img_link)}")  # download the image

                    self.__msg(f"Download of image {i}, done!")
                    dl_count += 1  # increment download counter
                except Exception as err:
                    # best effort: one broken image must not abort the others
                    self.__msg(f"ERROR: Unable to process image {i} -> err[{err}].", MessageType.ERROR)
            # end images processing

            self.__msg(f"WebPicDownloader has processed {dl_count} images out of {tot_count}.")
            self.__callbacks.get('success')()  # success, launch callback
        except Exception as err:
            self.__msg(f"ERROR: An error occured -> err[{err}]", MessageType.ERROR)
            self.__callbacks.get('failure')()  # error, launch callback
        finally:
            # always release the busy flag, even if a callback raised, so that
            # the worker is never left marked as running
            self.__dl_infos['running'] = False  # indicate that the thread is free
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Internal entry point for testing and console use.

    def _silent_messenger(message, msg_type=None):
        # The messenger is invoked with (message, type); a one-argument callback
        # would raise TypeError inside the worker thread. Messages are
        # deliberately discarded here, as before.
        pass

    wpd = WebPicDownloader()
    wpd.set_messenger_callback(_silent_messenger)

    while True:
        url = input("Website URL ? ")
        name = input("Folder name ? ")
        wpd.start_downloading(url, name)
        if "n" == input("Do you want to continue [Y/n] ? ").lower():
            break

    # single stop request, blocking until the worker thread has terminated
    wpd.stop_downloading(block=True)
    print("Good bye !")
|
||||
Reference in New Issue
Block a user