# "The world is indeed comic, but the joke is on mankind."
#     -- H. P. Lovecraft
# true.. true.. 6:23:15 PM
# Design note: each function below handles exactly one task (division of labor).
# Usage: python download_lefthand_comic.py
import logging
import os
import requests
import bs4
import time
import re
# TODO: CHECK FOR LAST UPDATE
# TODO: DOWNLOAD COMIC IMAGES STARTING FROM LAST UPDATE + 1
# TODO: UPDATE LAST UPDATE
# TODO: NOTIFY USERS OF DOWNLOAD
# TODO: USE THE WINDOWS SCHEDULER TO RUN SCRIPT ONCE A DAY
# Log everything from DEBUG upwards, prefixed with a timestamp and the level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

# Root of the site; page paths and "next" links are appended to it.
base_url = "http://www.lefthandedtoons.com"

# Absolute path of ./comicbooks/lefthandedtoons, resolved against the current
# working directory; images and the progress marker are stored here.
comic_directory = os.path.abspath(
    os.path.join(".", "comicbooks", "lefthandedtoons")
)
def handle_each_comic(soup: bs4.BeautifulSoup):
    """Download the comic image found on one parsed comic page.

    The image URL is read from the ``src`` of the element with class
    ``comicimage``; the comic's name is the text of the link inside the
    element with class ``comictitle``.  The image is saved as
    ``<name>.<ext>`` inside ``comic_directory``, which is created on demand.
    Characters that are not allowed in file names are replaced with ``_``.

    Missing page elements, download failures and file errors are logged
    instead of raised, so a single bad page does not abort a crawl.

    Args:
        soup: Parsed HTML of a single comic page.

    Returns:
        Always ``True``, whether or not an image was saved.
    """
    comic_image = soup.find(class_="comicimage")
    if not comic_image:
        logging.warning("comic Image not found")
        return True

    img_src = comic_image.get("src")
    title_block = soup.find(class_="comictitle")
    title_link = title_block.find("a") if title_block else None
    if not img_src or title_link is None:
        logging.warning("Comic page is missing its image src or title; skipping")
        return True

    comic_name = title_link.get_text()
    logging.info("Comic: {} has src {}".format(comic_name, img_src))

    # Accept 3- or 4-letter extensions (png, jpeg, ...) and tolerate a
    # trailing query string or fragment after the extension.
    ext_match = re.search(r"\.(\w{3,4})(?:[?#].*)?$", img_src)
    if ext_match is None:
        logging.warning("Could not determine image extension from: " + img_src)
        return True
    comic_image_ext = ext_match.group(1)

    # Titles are free text: swap out path separators, characters that are
    # reserved in file names, and control characters so open() cannot fail
    # on them.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", comic_name).strip()
    if not safe_name:
        safe_name = "untitled"

    if not os.path.exists(comic_directory):
        os.makedirs(comic_directory, exist_ok=True)
        logging.info("Comic Directory Created: " + comic_directory)

    try:
        # A timeout keeps an unresponsive server from hanging the run.
        res = requests.get(img_src, timeout=30)
        res.raise_for_status()
    except requests.RequestException as err:
        logging.error("Error while downloading image: " + str(err))
        return True

    filename = os.path.abspath(
        os.path.join(comic_directory, safe_name + "." + comic_image_ext))
    logging.info("downloading: " + filename)
    try:
        # "with" guarantees the handle is closed even if a write fails.
        with open(filename, "wb") as comic_file:
            for chunk in res.iter_content(chunk_size=1024):
                comic_file.write(chunk)
    except OSError as file_opening_err:
        logging.error("Error while opening file " + str(file_opening_err))
    else:
        logging.info("Download Complete")
    return True
def get_last_update():
    """Return the comic number stored in ``last_update.txt``.

    The marker file lives directly inside ``comic_directory``.

    Returns:
        The stored number as an ``int``, or ``0`` when the file does not
        exist or does not contain a valid integer.
    """
    filename = os.path.abspath(
        os.path.join(comic_directory, "last_update.txt"))
    try:
        # "with" closes the handle; strip() tolerates a trailing newline.
        with open(filename) as marker_file:
            content = marker_file.read().strip()
    except FileNotFoundError:
        return 0

    logging.info("Last update: " + content)
    try:
        return int(content)
    except ValueError:
        # An empty or corrupt marker must not crash the script at start-up.
        logging.warning("Ignoring invalid last update value: " + repr(content))
        return 0
def last_update(last_comic: int):
    """Write ``last_comic`` to ``last_update.txt`` inside ``comic_directory``.

    The directory is created first when it does not exist, so progress can be
    recorded even before any image has been saved.

    Args:
        last_comic: Comic number to store; any previous value is overwritten.

    Returns:
        None.
    """
    os.makedirs(comic_directory, exist_ok=True)
    filename = os.path.abspath(
        os.path.join(comic_directory, "last_update.txt"))
    # "with" closes the handle even if the write raises.
    with open(filename, "w") as marker_file:
        marker_file.write(str(last_comic))
    logging.info("Last comic updated: " + str(last_comic))
    return
def download_comic(start: int = 0):
    """Crawl comic pages from ``start`` onwards and download each comic.

    Fetches ``base_url/<start>`` (``base_url/1`` when ``start`` is ``None``
    or not positive), hands the parsed page to ``handle_each_comic`` and then
    follows the link inside the element with class ``next``.  The crawl stops
    when a page has no such link or when the per-run page limit is reached.

    After every page, the number at the end of the upcoming page's URL is
    stored with ``last_update``, so progress is kept even if a later request
    fails.

    Args:
        start: Comic number to begin with; ``None`` or a value <= 0 means
            "start at the first comic".

    Returns:
        None.  Failures are logged, not raised.
    """
    comic_url = base_url
    if start is not None and start > 0:
        comic_url += "/" + str(start)
        logging.info("Comic Url updated: " + comic_url)
    else:
        comic_url += "/1"
        logging.info("Downloading from the very beginning")

    last_update_exp = re.compile(r"/(\d+)/?$")
    # Bound a single run: the starting page plus at most 100 follow-up pages.
    max_pages = 101

    try:
        for _ in range(max_pages):
            # A timeout keeps an unresponsive server from hanging the run.
            res = requests.get(comic_url, timeout=30)
            res.raise_for_status()
            comic_soup = bs4.BeautifulSoup(res.text, "html.parser")
            handle_each_comic(comic_soup)

            # A missing "next" link is the normal end of the crawl, so test
            # for it explicitly instead of failing on None.
            next_button = comic_soup.find(class_="next")
            next_link = next_button.find("a") if next_button else None
            next_href = next_link.get("href") if next_link else None
            if not next_href:
                logging.info("No next button. Download complete")
                break

            comic_url = base_url + next_href
            logging.info("The next download is: " + comic_url)
            logging.info("-" * 20)

            # Remember the upcoming comic so that a later run can resume.
            last_update_num = last_update_exp.search(comic_url)
            if last_update_num:
                last_update(int(last_update_num.group(1)))
        else:
            logging.info("Page limit reached; next comic is: " + comic_url)
    except requests.RequestException as reqerror:
        logging.error("Error while requesting for comic: " + str(reqerror))
    except Exception:
        # Top-level boundary of the crawl: record the traceback, don't crash.
        logging.exception("Unexpected error while downloading comics")
    return None
if __name__ == "__main__":
    # Script entry point: resume from the stored comic number when there is
    # one, otherwise start from the first comic.
    logging.info("Starting Download....: @ " + str(time.time()))
    start = get_last_update()
    # get_last_update() returns 0 when nothing has been recorded yet; only in
    # that case fall back to None ("start at the beginning").  Testing
    # ``start >= 0`` here would be true for every value and throw the saved
    # progress away on each run.
    if start <= 0:
        start = None
    download_comic(start)