The world is indeed comic, but the joke is on mankind.
H. P. Lovecraft
true.. true.. 6:23:15 PM
I split the script into separate functions, each handling one specific task (division of labor).
# usage python download_lefthand_comic.py
"""Download comics from lefthandedtoons.com, resuming where the last run stopped.

Each comic page is fetched, its image is saved into ``comic_directory`` under
the comic's title, and the page's "next" link is followed.  The number of the
next comic to fetch is persisted in ``last_update.txt`` so that a later run
can continue from there instead of starting over.
"""
import logging
import os
import re
import time
from typing import Optional
from urllib.parse import urljoin, urlparse

import bs4
import requests

# TODO: CHECK FOR LAST UPDATE
# TODO: DOWNLOAD COMIC IMAGES STARTING FROM LAST UPDATE + 1
# TODO: UPDATE LAST UPDATE
# TODO:NOTIFY USERS OF DOWNLOAD
# #TODO:USE THE WINDOWS SCHEDULER TO RUN SCRIPT ONCE A DAY

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s - %(levelname)s - %(message)s")

comic_directory = os.path.join(".", "comicbooks", "lefthandedtoons")
comic_directory = os.path.abspath(comic_directory)
base_url = "http://www.lefthandedtoons.com"

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would hang an unattended (scheduled) run forever.
REQUEST_TIMEOUT = 30
# Upper bound on how many "next" links one run follows.
MAX_FOLLOWED_LINKS = 100

_LAST_UPDATE_FILE = "last_update.txt"
_IMAGE_EXT_RE = re.compile(r"\.(\w{3,4})$")
_COMIC_NUMBER_RE = re.compile(r"/(\d+)/?$")
# Characters that are path separators or are rejected in Windows file names.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def _ensure_comic_directory():
    """Create ``comic_directory`` if it does not exist yet."""
    if not os.path.exists(comic_directory):
        os.makedirs(comic_directory, exist_ok=True)
        logging.info("Comic Directory Created: %s", comic_directory)


def _last_update_path():
    """Return the absolute path of the file that stores the resume position."""
    return os.path.abspath(os.path.join(comic_directory, _LAST_UPDATE_FILE))


def _safe_filename(name):
    """Return ``name`` with characters that cannot appear in a file name replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS_RE.sub("_", name).strip()
    return cleaned or "untitled"


def _next_comic_url(soup, page_url):
    """Return the absolute URL of the page's "next" link, or None if there is none."""
    next_button = soup.find(class_="next")
    if not next_button:
        return None
    link = next_button.find("a")
    if link is None:
        return None
    href = link.get("href")
    if not href:
        return None
    # urljoin copes with root-relative, page-relative and absolute hrefs alike.
    return urljoin(page_url, href)


def _save_progress(next_url):
    """Persist the comic number found at the end of ``next_url``, if any."""
    match = _COMIC_NUMBER_RE.search(next_url)
    if not match:
        return
    try:
        last_update(int(match.group(1)))
    except OSError as err:
        # Losing the bookmark only costs a re-download; keep crawling.
        logging.error("Could not save last update: %s", err)


def handle_each_comic(soup: bs4.BeautifulSoup):
    """Save the comic image found on a parsed comic page.

    The image is the element with class ``comicimage``; the file is named after
    the text of the link inside the ``comictitle`` element plus the image's
    extension, and is written into ``comic_directory``.

    Problems (missing elements, download or file errors) are logged and the
    comic is skipped, so one bad page does not abort the whole run.

    Returns:
        Always ``True``.
    """
    comic_image = soup.find(class_="comicimage")
    if not comic_image:
        logging.warning("comic Image not found")
        return True

    img_src = comic_image.get("src")
    if not img_src:
        logging.warning("comic Image has no src attribute")
        return True
    image_url = urljoin(base_url + "/", img_src)

    title = soup.find(class_="comictitle")
    title_link = title.find("a") if title else None
    if title_link is None:
        logging.warning("comic title not found for image %s", image_url)
        return True
    comic_name = title_link.get_text()
    logging.info("Comic: %s has src %s", comic_name, image_url)

    # Look at the URL path only so a query string cannot hide the extension.
    ext_match = _IMAGE_EXT_RE.search(urlparse(image_url).path)
    if ext_match is None:
        logging.warning("Could not determine image extension for %s", image_url)
        return True
    comic_image_ext = ext_match.group(1)

    try:
        _ensure_comic_directory()
    except OSError as dir_err:
        logging.error("Error while creating comic directory: %s", dir_err)
        return True

    try:
        res = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as err:
        logging.error("Error while downloading image: %s", err)
        return True

    filename = os.path.abspath(os.path.join(
        comic_directory, _safe_filename(comic_name) + "." + comic_image_ext))
    logging.info("downloading: %s", filename)
    try:
        with open(filename, "wb") as comic_file:
            for chunk in res.iter_content(chunk_size=1024):
                comic_file.write(chunk)
    except OSError as file_opening_err:
        logging.error("Error while opening file %s", file_opening_err)
    else:
        logging.info("Download Complete")
    return True


def get_last_update():
    """Return the saved resume position, or 0 when none is usable.

    Reads ``last_update.txt`` in ``comic_directory``.  A missing, unreadable,
    empty or non-numeric file yields 0 rather than raising.
    """
    filename = _last_update_path()
    try:
        with open(filename) as file:
            content = file.read().strip()
    except FileNotFoundError:
        return 0
    except OSError as err:
        logging.warning("Could not read last update: %s", err)
        return 0

    logging.info("Last update: %s", content)
    try:
        return int(content)
    except ValueError:
        logging.warning("Ignoring invalid last update value: %r", content)
        return 0


def last_update(last_comic: int):
    """Write ``last_comic`` to ``last_update.txt`` in ``comic_directory``.

    The directory is created first if needed.

    Raises:
        OSError: if the directory or file cannot be written.
    """
    _ensure_comic_directory()
    with open(_last_update_path(), "w") as file:
        file.write(str(last_comic))
    logging.info("Last comic updated: %s", last_comic)
    return


def download_comic(start: Optional[int] = 0,
                   max_links: int = MAX_FOLLOWED_LINKS):
    """Download comics beginning at comic number ``start`` and follow "next" links.

    Args:
        start: Comic number to begin with.  ``None``, 0 or a negative number
            means comic 1.
        max_links: Maximum number of "next" links to follow in this run.

    After each page, the number of the *next* comic is saved with
    ``last_update`` so an interrupted run can resume there.  The run ends when
    a page has no "next" link, when ``max_links`` is reached, or when a page
    request fails (which is logged).

    Returns:
        ``None``.
    """
    if start is not None and start > 0:
        comic_url = base_url + "/" + str(start)
        logging.info("Comic Url updated: %s", comic_url)
    else:
        comic_url = base_url + "/1"
        logging.info("Downloading from the very beginning")

    followed = 0
    while True:
        try:
            res = requests.get(comic_url, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as reqerror:
            logging.error("Error while requesting for comic: %s", reqerror)
            return None

        comic_soup = bs4.BeautifulSoup(res.text, "html.parser")
        handle_each_comic(comic_soup)

        next_url = _next_comic_url(comic_soup, comic_url)
        if next_url is None or next_url == comic_url:
            # A link back to the same page would loop forever; treat as the end.
            logging.info("No next button. Download complete")
            return None

        logging.info("The next download is: %s", next_url)
        logging.info("-" * 20)
        _save_progress(next_url)

        if followed >= max_links:
            logging.info("Reached the limit of %s followed links", max_links)
            return None
        comic_url = next_url
        followed += 1


if __name__ == "__main__":
    logging.info("Starting Download....: @ %s", time.time())
    start = get_last_update()
    # The saved number is already the next comic to fetch, so resume from it
    # directly; only fall back to the beginning when nothing has been saved.
    if start <= 0:
        start = None
    download_comic(start)