Python Learning: Automate Boring Stuff with Python | Chapter 15: Scheduled Web Comic Downloader

 

The world is indeed comic, but the joke is on mankind.

H. P. Lovecraft

True, true. 6:23:15 PM


I made different functions, each handling one specific task — division of labor.

 

# usage python download_lefthand_comic.py
import logging
import os
import requests
import bs4
import time
import re

# TODO: CHECK FOR LAST UPDATE
# TODO: DOWNLOAD COMIC IMAGES STARTING FROM LAST UPDATE + 1
# TODO: UPDATE LAST UPDATE
# TODO: NOTIFY USERS OF DOWNLOAD
# TODO: USE THE WINDOWS SCHEDULER TO RUN SCRIPT ONCE A DAY
# Root logger at DEBUG so every step of the crawl is traced to stdout.
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s - %(levelname)s - %(message)s")
# All downloaded images and the last_update.txt bookmark live here.
comic_directory = os.path.join(".", "comicbooks", "lefthandedtoons")
comic_directory = os.path.abspath(comic_directory)
# Site being scraped; per-comic pages are base_url + "/<number>".
base_url = "http://www.lefthandedtoons.com"


def handle_each_comic(soup: bs4.BeautifulSoup):
    """Download the comic image found in *soup* into ``comic_directory``.

    Finds the element with class ``comicimage``, names the output file
    after the comic title plus the image's extension, and streams the
    image bytes to disk.  Errors are logged, never raised.

    Returns True in all cases (the original contract — callers ignore it).
    """
    comic_image = soup.find(class_="comicimage")
    if not comic_image:
        logging.warning("comic Image not found")
        return True

    img_src = comic_image.get("src")
    title = soup.find(class_="comictitle")
    title_link = title.find("a") if title else None
    if not img_src or title_link is None:
        # Page layout changed or is incomplete; skip rather than crash.
        logging.warning("comic title or image src missing; skipping page")
        return True

    comic_name = title_link.get_text()
    logging.info("Comic: {} has src {}".format(comic_name, img_src))

    ext_match = re.search(r"\.(\w{3})$", img_src)
    if not ext_match:
        logging.warning("could not determine extension for " + img_src)
        return True
    comic_image_ext = ext_match.group(1)

    if not os.path.exists(comic_directory):
        os.makedirs(comic_directory)
        logging.info("Comic Directory Created: " + comic_directory)

    try:
        res = requests.get(img_src)
        res.raise_for_status()
        try:
            # Replace characters that are invalid in Windows filenames.
            safe_name = re.sub(r'[\\/:*?"<>|]', "_", comic_name)
            filename = os.path.abspath(
                os.path.join(comic_directory, safe_name + "." + comic_image_ext))
            logging.info("downloading: " + filename)
            # "with" guarantees the handle closes even if a write fails.
            with open(filename, "wb") as comic_file:
                for chunk in res.iter_content(chunk_size=1024):
                    comic_file.write(chunk)
            # Log once per file, not once per 1024-byte chunk.
            logging.info("Download Complete")
        except Exception as file_opening_err:
            logging.error("Error while opening file " +
                          str(file_opening_err))
    except Exception as err:
        logging.error("Error while downloading image: " + str(err))
    return True


def get_last_update():
    """Return the number of the last downloaded comic, or 0 if unknown.

    Reads ``last_update.txt`` inside ``comic_directory``; a missing or
    unparsable file means "start from the beginning" (0).
    """
    filename = os.path.abspath(
        os.path.join(comic_directory, "last_update.txt"))
    if not os.path.exists(filename):
        return 0
    # "with" closes the handle; the original leaked it.
    with open(filename) as file:
        content = file.read().strip()
    logging.info("Last update: " + content)
    try:
        return int(content)
    except ValueError:
        # Empty or corrupt bookmark file: fall back to a full download.
        logging.warning("last_update.txt is not a number; starting from 0")
        return 0


def last_update(last_comic: int):
    """Persist *last_comic* to ``last_update.txt`` so later runs resume.

    Creates ``comic_directory`` first if needed (a fresh run may record
    progress before any image forced the directory into existence).
    """
    if not os.path.exists(comic_directory):
        os.makedirs(comic_directory)
    filename = os.path.abspath(
        os.path.join(comic_directory, "last_update.txt"))
    # "with" guarantees the write is flushed and the handle closed.
    with open(filename, "w") as file:
        file.write(str(last_comic))
    logging.info("Last comic updated: " + str(last_comic))
    return


def download_comic(start: int = 0):
    """Crawl lefthandedtoons.com starting at comic *start*, downloading
    each page's image and following the "next" link.

    Stops cleanly when there is no next link (or after a 100-page safety
    cap) and records the last comic number via ``last_update`` — the
    original version aborted on an AttributeError at the end of the
    archive, skipping that bookkeeping.  Network errors are logged.
    Always returns None.
    """
    if start is not None and start > 0:
        comic_url = base_url + "/" + str(start)
        logging.info("Comic Url updated: " + comic_url)
    else:
        comic_url = base_url + "/1"
        logging.info("Downloading from the very beginning")

    last_update_exp = re.compile(r"/(\d+)/?$")
    try:
        pages = 0
        # Safety cap of 100 pages per run, matching the original limit.
        while comic_url and pages < 100:
            res = requests.get(comic_url)
            res.raise_for_status()
            comicSoup = bs4.BeautifulSoup(res.text, "html.parser")
            handle_each_comic(comicSoup)

            next_button = comicSoup.find(class_="next")
            next_link = next_button.find("a") if next_button else None
            if next_link is None or not next_link.get("href"):
                # End of the archive: exit the loop instead of crashing.
                logging.info("No next button. Download complete")
                break
            comic_url = base_url + next_link["href"]
            logging.info("The next download is: " + comic_url)
            logging.info("-" * 20)
            pages += 1

        # Record where we stopped so the next run can resume.
        last_update_num = last_update_exp.search(comic_url)
        if last_update_num:
            last_update(int(last_update_num.group(1)))
    except Exception as reqerror:
        logging.error("Error while requesting for comic: " + str(reqerror))

    return None


if __name__ == "__main__":
    logging.info("Starting Download....: @ " + str(time.time()))
    start = get_last_update()
    # 0 means "no saved progress": pass None so download_comic starts at
    # comic #1.  The original `if start >= 0: start = None` discarded the
    # bookmark on every run (get_last_update never returns a negative).
    if start <= 0:
        start = None
    download_comic(start)