Coding

Python Learning: Automate the Boring Stuff with Python | Chapter 15: Scheduled Web Comic Downloader

Book

The world is indeed comic, but the joke is on mankind.

H. P. Lovecraft

true.. true.. 6:23:15 PM


I split the script into separate functions, each handling exactly one task: division of labor.

</p>
<p>#usage python download_lefthand_comic.py<br>
import logging, os, requests, bs4,time, re</p>
# Timestamped, levelled log lines; DEBUG so every step of a run is visible.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

# Absolute folder that receives the downloaded images and last_update.txt.
comic_directory = os.path.abspath(
    os.path.join("c:\\", "comicbooks", "lefthandedtoons")
)

# Root of the comic site; page URLs are built by appending to this.
base_url = "http://www.lefthandedtoons.com"
def handle_each_comic(soup: bs4.BeautifulSoup):
    """Download the comic image found on one parsed comic page.

    Looks up the element with class ``comicimage`` and the link inside the
    element with class ``comictitle``, then saves the image into
    ``comic_directory`` as ``<title>.<ext>``, creating that directory first
    when it does not exist. Problems with the page, the download or the
    file are logged instead of raised, so one bad page does not abort the
    caller's crawl.

    Args:
        soup: Parsed HTML of a single comic page.

    Returns:
        Always ``True``, whether or not an image was saved.
    """
    comic_image = soup.find(class_="comicimage")
    if not comic_image:
        logging.warning("comic Image not found")
        return True

    img_src = comic_image.get("src")
    title_box = soup.find(class_="comictitle")
    title_link = title_box.find("a") if title_box else None
    if not img_src or title_link is None:
        # Previously this path raised AttributeError on the missing element.
        logging.warning("Comic image source or title not found")
        return True

    comic_name = title_link.get_text()
    logging.info("Comic: {} has src {}".format(comic_name, img_src))

    # Accept 3- or 4-letter extensions so ".jpeg" works as well as ".jpg".
    ext_match = re.search(r"\.(\w{3,4})$", img_src)
    if ext_match is None:
        logging.warning("Could not determine image extension from: " + img_src)
        return True
    comic_image_ext = ext_match.group(1)

    # Titles may contain characters that are not allowed in Windows file
    # names (e.g. "?" or ":"); replace them so open() does not fail.
    safe_name = re.sub(r'[\\/:*?"<>|]', "_", comic_name).strip() or "untitled"

    if not os.path.exists(comic_directory):
        os.makedirs(comic_directory)
        logging.info("Comic Directory Created: " + comic_directory)

    try:
        res = requests.get(img_src)
        res.raise_for_status()
    except requests.RequestException as err:
        logging.error("Error while downloading image: " + str(err))
        return True

    filename = os.path.abspath(
        os.path.join(comic_directory, safe_name + "." + comic_image_ext)
    )
    logging.info("downloading: " + filename)
    try:
        # The context manager closes the file even when a write fails.
        with open(filename, "wb") as comic_file:
            for chunk in res.iter_content(chunk_size=1024):
                comic_file.write(chunk)
    except OSError as file_opening_err:
        logging.error("Error while opening file " + str(file_opening_err))
    else:
        logging.info("Download Complete")
    return True
def get_last_update(directory=None):
    """Return the comic number stored in ``last_update.txt``.

    Args:
        directory: Folder that holds ``last_update.txt``. When omitted, the
            module-level ``comic_directory`` is used.

    Returns:
        The stored number as an ``int``, or ``0`` when the file does not
        exist or does not contain a valid integer.
    """
    if directory is None:
        directory = comic_directory
    filename = os.path.abspath(os.path.join(directory, "last_update.txt"))
    try:
        # The context manager guarantees the handle is closed after reading.
        with open(filename) as handle:
            content = handle.read()
    except FileNotFoundError:
        return 0
    logging.info("Last update: " + content)
    try:
        return int(content)
    except ValueError:
        # An empty or corrupted marker file should not crash the run;
        # treat it as "nothing downloaded yet".
        logging.warning("Ignoring unreadable last update value: %r", content)
        return 0
def last_update(last_comic: int, directory=None):
    """Write *last_comic* to ``last_update.txt``, replacing any old value.

    Args:
        last_comic: Comic number to store.
        directory: Folder that holds ``last_update.txt``. When omitted, the
            module-level ``comic_directory`` is used.

    Returns:
        None.
    """
    if directory is None:
        directory = comic_directory
    directory = os.path.abspath(directory)
    # The folder may not exist yet (e.g. no image has been saved so far);
    # create it so writing the marker file cannot fail on a missing path.
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, "last_update.txt")
    with open(filename, "w") as handle:
        handle.write(str(last_comic))
    logging.info("Last comic updated: " + str(last_comic))
    return
def _next_comic_href(soup):
    """Return the href of the link inside the ``next`` element, or None."""
    next_button = soup.find(class_="next")
    if not next_button:
        return None
    link = next_button.find("a")
    if link is None:
        return None
    return link.get("href")


def download_comic(start: int = 0):
    """Crawl comic pages from *start*, following "next" links.

    Each page is fetched, parsed and handed to ``handle_each_comic``. The
    crawl stops when a page has no "next" link, when a request or page
    fails, or after 101 pages (the first page plus 100 follow-ups). The
    number at the end of the URL where the crawl stopped is then stored via
    ``last_update`` so a later run can resume from it.

    Args:
        start: Comic number to start from. ``None`` or a value ``<= 0``
            starts at comic 1.

    Returns:
        None.
    """
    # Compiled before any request so the bookkeeping at the end still works
    # when the very first request fails.
    last_update_exp = re.compile(r"/(\d+)/?$")
    max_pages = 101  # first page + at most 100 follow-up pages per run

    comic_url = base_url
    if start is not None and start > 0:
        comic_url += "/" + str(start)
        logging.info("Comic Url updated: " + comic_url)
    else:
        comic_url += "/1"
        logging.info("Downloading from the very beginning")

    reached_end = False
    try:
        for _ in range(max_pages):
            res = requests.get(comic_url)
            res.raise_for_status()
            comic_soup = bs4.BeautifulSoup(res.text, "html.parser")
            handle_each_comic(comic_soup)
            next_href = _next_comic_href(comic_soup)
            if not next_href:
                reached_end = True
                break
            comic_url = base_url + next_href
            logging.info("The next download is: " + comic_url)
            logging.info("-" * 20)
    except requests.RequestException as reqerror:
        logging.error("Error while requesting for comic: " + str(reqerror))
    except Exception:
        # Best effort: a malformed page must not lose the progress made so
        # far, so log the traceback and fall through to the bookkeeping.
        logging.exception("Unexpected error while processing " + comic_url)

    # comic_url is either the last page handled (no "next" link) or the
    # page that still has to be fetched (error or page cap reached).
    logging.info("Resume point: " + comic_url)
    last_update_num = last_update_exp.search(comic_url)
    if last_update_num:
        try:
            last_update(int(last_update_num.group(1)))
        except OSError as err:
            logging.error("Could not record last update: " + str(err))

    if reached_end:
        logging.info("No next button. Download complete")
    else:
        logging.info("Stopped early; the next run resumes from " + comic_url)
#TODO: CHECK FOR LAST UPDATE
#TODO: DOWNLOAD COMIC IMAGES STARTING FROM LAST UPDATE + 1
#TODO: UPDATE LAST UPDATE
#TODO:NOTIFY USERS OF DOWNLOAD
#TODO:USE THE WINDOWS SCHEDULER TO RUN SCRIPT ONCE A DAY
# Script entry point: resume from the stored comic number, if any.
if __name__ == "__main__":
    logging.info("Starting Download....: @ " + str(time.time()))
    start = get_last_update()
    # A stored value of 0 (or less) means nothing has been recorded yet;
    # pass None so download_comic starts from the first comic.
    if start <= 0:
        start = None
    download_comic(start)
<p>
error: Content is protected !!