Python Learning: Automate the Boring Stuff with Python | Chapter 15: Scheduled Web Comic Downloader

The world is indeed comic, but the joke is on mankind.

H. P. Lovecraft

true.. true.. 6:23:15 PM


I split the script into separate functions, each handling one specific task (division of labor).






#usage python download_lefthand_comic.py
import logging, os, requests, bs4,time, re

# Log everything from DEBUG upwards, prefixed with a timestamp and the level name.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.DEBUG)

# Absolute path of the folder that receives the comic images and the
# last-update marker file.
comic_directory = os.path.abspath(os.path.join("c:\\", "comicbooks", "lefthandedtoons"))

# Root of the comic site; page URLs are built by appending "/<number>" to it.
base_url = "http://www.lefthandedtoons.com"

def _comic_filename(comic_name, img_url):
	"""Build a filesystem-safe file name for a comic from its title and image URL."""
	from urllib.parse import urlparse

	# Take the extension from the URL path only, so query strings and
	# extensions that are not exactly three characters (".jpeg") still work.
	stem, ext = os.path.splitext(os.path.basename(urlparse(img_url).path))
	# Characters such as ? : / are common in titles but illegal in file names
	# (notably on Windows); replace them instead of failing to open the file.
	safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", comic_name.strip())
	return (safe_name or stem or "comic") + ext


def handle_each_comic(soup: bs4.BeautifulSoup):
	"""Save the comic image found on one parsed comic page into comic_directory.

	The image is the element with class "comicimage"; the file is named after
	the text of the link inside the element with class "comictitle", falling
	back to the image's own name when the page has no title.

	Every failure (no image on the page, download error, file error) is
	logged rather than raised, so one bad page does not abort the caller.

	Args:
		soup: Parsed HTML of a single comic page.

	Returns:
		Always True, whether or not an image was saved.
	"""
	from urllib.parse import urljoin

	comic_image = soup.find(class_="comicimage")
	img_src = comic_image.get("src") if comic_image else None
	if not img_src:
		logging.warning("comic Image not found")
		return True

	title_tag = soup.find(class_="comictitle")
	title_link = title_tag.find("a") if title_tag else None
	comic_name = title_link.get_text() if title_link else ""
	logging.info("Comic: {} has src {}".format(comic_name, img_src))

	# Resolve relative or protocol-relative sources; absolute URLs pass through unchanged.
	img_url = urljoin(base_url, img_src)
	try:
		# A timeout keeps an unattended, scheduled run from hanging forever.
		res = requests.get(img_url, timeout=30)
		res.raise_for_status()
	except requests.RequestException as err:
		logging.error("Error while downloading image: " + str(err))
		return True

	filename = os.path.abspath(os.path.join(comic_directory, _comic_filename(comic_name, img_url)))
	try:
		if not os.path.isdir(comic_directory):
			os.makedirs(comic_directory, exist_ok=True)
			logging.info("Comic Directory Created: " + comic_directory)
		logging.info("downloading: " + filename)
		# "with" closes the file even when a write fails part-way through.
		with open(filename, "wb") as comic_file:
			for chunk in res.iter_content(chunk_size=1024):
				comic_file.write(chunk)
	except OSError as file_err:
		logging.error("Error while opening file " + str(file_err))
	else:
		logging.info("Download Complete")
	return True
	
	
	
	

def get_last_update():
	"""Return the comic number stored in last_update.txt inside comic_directory.

	Returns:
		The stored number as an int, or 0 when the marker file does not exist
		or does not contain a valid integer.
	"""
	filename = os.path.abspath(os.path.join(comic_directory, "last_update.txt"))
	try:
		# "with" guarantees the handle is closed; the original never closed it.
		with open(filename) as marker_file:
			content = marker_file.read().strip()
	except FileNotFoundError:
		return 0
	logging.info("Last update: " + content)
	try:
		return int(content)
	except ValueError:
		# An empty or corrupted marker should restart the download, not crash the run.
		logging.warning("Ignoring invalid last update value: " + repr(content))
		return 0
	
	
	
	
def last_update(last_comic: int):
	"""Write *last_comic* to last_update.txt inside comic_directory.

	Args:
		last_comic: Comic number to store in the marker file.

	Raises:
		OSError: If the directory or the marker file cannot be written.
	"""
	filename = os.path.abspath(os.path.join(comic_directory, "last_update.txt"))
	# The directory is otherwise only created when a comic image is saved, so
	# it may not exist yet if nothing has been downloaded.
	os.makedirs(comic_directory, exist_ok=True)
	with open(filename, "w") as marker_file:
		marker_file.write(str(last_comic))
	logging.info("Last comic updated: " + str(last_comic) )
	return

def _next_comic_url(soup, page_url):
	"""Return the absolute URL behind the page's "next" link, or None if it has none."""
	from urllib.parse import urljoin

	next_button = soup.find(class_="next")
	link = next_button.find("a") if next_button else None
	href = link.get("href") if link else None
	return urljoin(page_url, href) if href else None


def download_comic(start: int=0, max_comics: int=101):
	"""Download consecutive comics by following each page's "next" link.

	Crawling starts at base_url/<start> (or base_url/1 when start is None or
	not positive) and stops at the first page without a "next" link, on the
	first error, or after max_comics pages. The number at the end of the URL
	where crawling stopped is then stored through last_update().

	Args:
		start: Comic number to begin with; None or <= 0 means the first comic.
		max_comics: Upper bound on pages fetched in one run. The default
			equals the previous hard-coded limit (one page plus 100 follow-ups).
	"""
	comic_url = base_url
	if start is not None and start > 0:
		comic_url += "/" + str(start)
		logging.info("Comic Url updated: " + comic_url)
	else:
		comic_url += "/1"
		logging.info("Downloading from the very beginning")

	reached_end = False
	try:
		for _ in range(max_comics):
			# A timeout keeps an unattended, scheduled run from hanging forever.
			res = requests.get(comic_url, timeout=30)
			res.raise_for_status()
			comic_soup = bs4.BeautifulSoup(res.text, "html.parser")
			handle_each_comic(comic_soup)
			next_url = _next_comic_url(comic_soup, comic_url)
			# A "next" link pointing back at the same page would loop until
			# the page limit, so treat it like a missing link.
			if next_url is None or next_url == comic_url:
				reached_end = True
				break
			comic_url = next_url
			logging.info("The next download is: " + comic_url)
			logging.info("-"*20)
	except Exception as err:
		# Best-effort boundary: log the failure and still record progress below.
		logging.error("Error while requesting for comic: " + str(err))

	# comic_url is now either the last page handled or the first page not yet
	# fetched; in both cases it is where the next run should start.
	logging.debug("Stopped at: " + comic_url)
	last_update_num = re.search(r"/(\d+)/?$", comic_url)
	if last_update_num:
		try:
			last_update(int(last_update_num.group(1)))
		except OSError as err:
			logging.error("Could not save last update: " + str(err))
	if reached_end:
		logging.info("No next button. Download complete")
	else:
		logging.info("Download stopped before the last comic; next run resumes at: " + comic_url)
#TODO: CHECK FOR LAST UPDATE 
#TODO: DOWNLOAD COMIC IMAGES STARTING FROM LAST UPDATE + 1
#TODO: UPDATE LAST UPDATE 
#TODO:NOTIFY USERS OF DOWNLOAD 
#TODO:USE THE WINDOWS SCHEDULER TO RUN SCRIPT ONCE A DAY


	
	
if __name__ == "__main__":
	# Script entry point: resume from the stored comic number, or start from
	# the first comic when no positive number is stored.
	logging.info("Starting Download....: @ " + str(time.time() ))
	resume_from = get_last_update()
	download_comic(resume_from if resume_from > 0 else None)