Coding

Python Learning: Automate the Boring Stuff with Python | Chapter 15: Scheduled Web Comic Downloader

Book

The world is indeed comic, but the joke is on mankind.

H. P. Lovecraft

true.. true.. 6:23:15 PM


I split the work into separate functions, each handling one specific task (division of labor).

# usage: python download_lefthand_comic.py
"""Download comics from lefthandedtoons.com, resuming where the last run stopped.

The comic number to resume from is stored in ``last_update.txt`` inside
``comic_directory``.
"""
import logging
import os
import re
import time
from urllib.parse import urljoin, urlparse

import bs4
import requests

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s – %(levelname)s – %(message)s")

comic_directory = os.path.abspath(os.path.join("c:\\", "comicbooks", "lefthandedtoons"))

base_url = "http://www.lefthandedtoons.com"

# Seconds to wait on the server; without a timeout a scheduled run could hang forever.
REQUEST_TIMEOUT = 30

_LAST_UPDATE_FILE = "last_update.txt"
# Trailing comic number of a comic URL, e.g. ".../123" or ".../123/".
_COMIC_NUMBER_RE = re.compile(r"/(\d+)/?$")
# Characters Windows does not allow in file names.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _safe_filename(name):
    """Return *name* with characters that are illegal in Windows file names replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS_RE.sub("_", name).strip().rstrip(". ")
    return cleaned or "comic"


def _image_extension(img_url):
    """Return the extension of the image URL's path without the dot ("" if none)."""
    return os.path.splitext(urlparse(img_url).path)[1].lstrip(".")


def _fetch_soup(url):
    """Fetch *url* and return the parsed page; raises requests.RequestException."""
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, "html.parser")


def _next_comic_url(soup):
    """Return the absolute URL of the page's "next" link, or None when there is none."""
    next_button = soup.find(class_="next")
    link = next_button.find("a") if next_button is not None else None
    href = link.get("href") if link is not None else None
    if not href:
        return None
    return urljoin(base_url + "/", href)


def handle_each_comic(soup: bs4.BeautifulSoup):
    """Save the comic image of one parsed comic page into ``comic_directory``.

    The image is the element with class ``comicimage``; the file is named
    after the link text inside the ``comictitle`` element, falling back to the
    image's own file name when no title link is present.

    Args:
        soup: Parsed HTML of a single comic page.

    Returns:
        Always True; problems are logged rather than raised.
    """
    comic_image = soup.find(class_="comicimage")
    img_src = comic_image.get("src") if comic_image is not None else None
    if not img_src:
        logging.warning("comic Image not found")
        return True
    # Resolve relative image paths; absolute URLs pass through unchanged.
    img_src = urljoin(base_url + "/", img_src)

    title = soup.find(class_="comictitle")
    title_link = title.find("a") if title is not None else None
    if title_link is not None:
        comic_name = title_link.get_text()
    else:
        comic_name = os.path.splitext(os.path.basename(urlparse(img_src).path))[0]
    logging.info("Comic: {} has src {}".format(comic_name, img_src))

    try:
        res = requests.get(img_src, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as err:
        logging.error("Error while downloading image: " + str(err))
        return True

    filename = _safe_filename(comic_name)
    extension = _image_extension(img_src)
    if extension:
        filename += "." + extension
    filename = os.path.abspath(os.path.join(comic_directory, filename))

    try:
        if not os.path.exists(comic_directory):
            os.makedirs(comic_directory, exist_ok=True)
            logging.info("Comic Directory Created: " + comic_directory)
        logging.info("downloading: " + filename)
        with open(filename, "wb") as comic_file:
            for chunk in res.iter_content(chunk_size=1024):
                comic_file.write(chunk)
    except OSError as file_opening_err:
        logging.error("Error while opening file " + str(file_opening_err))
    else:
        logging.info("Download Complete")
    return True


def get_last_update():
    """Return the comic number saved by the previous run.

    Returns:
        The stored number, or 0 when the file is missing, unreadable, or does
        not contain an integer.
    """
    filename = os.path.abspath(os.path.join(comic_directory, _LAST_UPDATE_FILE))
    try:
        with open(filename) as update_file:
            content = update_file.read().strip()
    except FileNotFoundError:
        return 0
    except OSError as err:
        logging.error("Could not read %s: %s", filename, err)
        return 0
    logging.info("Last update: " + content)
    try:
        return int(content)
    except ValueError:
        logging.warning("Ignoring invalid last update value: %r", content)
        return 0


def last_update(last_comic: int):
    """Persist *last_comic* as the comic number the next run should start from.

    Args:
        last_comic: Comic number to store.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    # The directory may not exist yet if no image was saved during this run.
    os.makedirs(comic_directory, exist_ok=True)
    filename = os.path.abspath(os.path.join(comic_directory, _LAST_UPDATE_FILE))
    with open(filename, "w") as update_file:
        update_file.write(str(last_comic))
    logging.info("Last comic updated: " + str(last_comic))


def download_comic(start: int = 0, max_pages: int = 100):
    """Download comics from *start* onwards by following each page's "next" link.

    When the crawl stops (no next link, page cap reached, or a request error),
    the comic number in the current URL is saved with ``last_update`` so the
    next run resumes from that page.

    Args:
        start: Comic number to begin with; None or a value <= 0 starts at 1.
        max_pages: Maximum number of "next" links to follow after the first
            page in a single run.
    """
    comic_url = base_url
    if start is not None and start > 0:
        comic_url += "/" + str(start)
        logging.info("Comic Url updated: " + comic_url)
    else:
        comic_url += "/1"
        logging.info("Downloading from the very beginning")

    reached_end = False
    pages_followed = 0
    try:
        while True:
            comic_soup = _fetch_soup(comic_url)
            handle_each_comic(comic_soup)
            next_url = _next_comic_url(comic_soup)
            # A missing next link, or one pointing back at this page, ends the crawl.
            if next_url is None or next_url == comic_url:
                reached_end = True
                break
            comic_url = next_url
            logging.info("The next download is: " + comic_url)
            logging.info("-" * 20)
            if pages_followed >= max_pages:
                break
            pages_followed += 1
    except requests.RequestException as reqerror:
        logging.error("Error while requesting for comic: " + str(reqerror))

    logging.info("Stopped at: " + comic_url)
    last_update_num = _COMIC_NUMBER_RE.search(comic_url)
    if last_update_num:
        last_update(int(last_update_num.group(1)))
    if reached_end:
        logging.info("No next button. Download complete")
    else:
        logging.info("Stopped before the last comic; the next run resumes from the saved position")
    #TODO: CHECK FOR LAST UPDATE
    #TODO: DOWNLOAD COMIC IMAGES STARTING FROM LAST UPDATE + 1
    #TODO: UPDATE LAST UPDATE
    #TODO:NOTIFY USERS OF DOWNLOAD
    #TODO:USE THE WINDOWS SCHEDULER TO RUN SCRIPT ONCE A DAY


if __name__ == "__main__":
    logging.info("Starting Download….: @ " + str(time.time()))
    start = get_last_update()
    if start <= 0:
        start = None
    download_comic(start)

Please follow and like us:
0

Enjoy this blog? Please spread the word :)

error: Content is protected !!