

There is a solution provided in the book, but this is my way of approaching the task.

 

#comic saver - saves each post of a comic page
#USAGE python comic.py 

import logging, requests, bs4, os, sys
from urllib.parse import urljoin
logging.basicConfig(level=logging.DEBUG, format=" %(asctime)s - %(levelname)s - %(message)s")
#TODO: get url

if len( sys.argv ) == 1:
    #comic_url = sys.argv[1]
    comic_url = "http://xkcd.com"
    try:
        #TODO: make request to url
        res = requests.get(comic_url)
        res.raise_for_status()
        comic_soup = bs4.BeautifulSoup(res.text,"html.parser" )
        logging.info("Soup created ")
        prev_button = comic_soup.select("a[rel='prev']")[0]
        prev_button_href = prev_button.get("href")
        logging.info("Prev button obtained")
        iterator = 0
        limit = 100
        directory = os.path.abspath("comic")
        if not os.path.exists(directory):
            os.mkdir(directory) #os.mkdir returns None, so keep the path itself in directory
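        #walk backwards through the archive: save the image on the current page,
        #then follow the rel='prev' link until its href is just "#" (the first comic)
        #or the safety limit is reached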
        while not prev_button_href.endswith("#") and iterator < limit:
            #TODO: get image in comic
            comic_image = comic_soup.select("#comic > img")
            assert comic_image, "Image should exist"
            if comic_image:
                comic_image = comic_image[0]
                comic_image_url = comic_image.get("src")
            comic_title = comic_soup.select("#ctitle")
            comic_title = comic_title[0].get_text()

            try:
                #TODO: download image
                comic_res = requests.get(urljoin(comic_url, comic_image_url)) #the img src is protocol-relative (//imgs.xkcd.com/...), so join it instead of concatenating
                comic_res.raise_for_status()
                #TODO: create a file
                comic_file = open(os.path.join(directory, comic_title + os.path.splitext(comic_image_url)[1]), "wb") #keep the real image extension (xkcd serves .png)
                logging.info("Comic File created as {}".format(comic_title))
                for chunk in comic_res.iter_content(10000):
                    comic_file.write(chunk)
                    #close file
                comic_file.close()
            except Exception as err:
                logging.error("Comic Download Error: " + str( err ))

            #TODO: switch to previous page
            res = requests.get(comic_url + prev_button_href)
            #TODO: res set soup
            comic_soup = bs4.BeautifulSoup(res.text,"html.parser" )
            #TODO: set prev_button
            prev_button = comic_soup.select("a[rel='prev']")
            if prev_button:
                prev_button = prev_button[0]
                prev_button_href = prev_button.get("href")
                logging.info("Page has been changed")
            iterator+= 1
    except Exception as err:
        logging.error(str(err))

else:
    logging.error("Script requires a ")

#TODO: loop over the previous button and make request
#TODO: save image in html
#TODO repeat till no previous button is found
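
If you want a quick sanity check of what was saved, listing the comic folder is enough (a minimal sketch, assuming the script above was run from the same directory):

import os

#list the downloaded comics and their sizes
for name in sorted(os.listdir("comic")):
    path = os.path.join("comic", name)
    print("{} - {} bytes".format(name, os.path.getsize(path)))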

After 2 minutes my Comic directory looked like this:

Just pythoning comic books…

See you soon!
