"""Scrape Digg front-page stories and user profile stats into MySQL.

NOTE: this script targets Python 2 (``urllib.urlopen`` and the BeautifulSoup 3
import path); ``print`` is called with a single parenthesised argument so the
output is the same under either print syntax.
"""

import urllib

from BeautifulSoup import BeautifulSoup
import MySQLdb
import MySQLdb.cursors

# Upper bounds used when the module is run as a script.
MAX_PAGES = 200
MAX_PROFILES = 1100

# MySQL connection settings.
dbhost = "foo"
dbport = 3306
dbuser = "foo"
dbpass = "foo"
dbname = "foo"


class DuplicateEntry(Exception):
    """Raised when a scraped story is already present in the ``digg`` table."""

    def __init__(self, value):
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return repr(self.value)


def _connect():
    """Open a MySQL connection whose cursors return each row as a dict."""
    return MySQLdb.connect(host=dbhost, port=dbport, user=dbuser,
                           passwd=dbpass, db=dbname,
                           cursorclass=MySQLdb.cursors.DictCursor)


def _fetch_html(url):
    """Download *url* and return the response body, closing the handle."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _store_front_page(cursor, page):
    """Insert every story found on front page number *page* into ``digg``.

    Raises:
        DuplicateEntry: when an insert is rejected with an integrity error.
    """
    html = _fetch_html("http://digg.com/page%s" % page)
    print("Got page %s" % page)
    soup = BeautifulSoup(html)
    for div in soup.findAll("div", {"class": "news-details"}):
        try:
            story_href = div.find("a")['href']
            username = div.find("img", {"class": "user-photo"})['alt']
            submitted_at = div.find(
                "span", {"class": "d", "property": "dc:date"})['content']
        except (TypeError, KeyError):
            # find() returned None or the attribute is absent: skip this
            # story and keep going with the rest of the page.
            continue
        try:
            cursor.execute(
                """INSERT INTO digg
                   SET user=%s, submission=%s, date=%s""",
                (username, story_href, submitted_at))
        except MySQLdb.IntegrityError:
            # NOTE(review): presumably ``digg`` has a unique key that makes a
            # re-insert fail -- confirm against the schema. Other database
            # errors are no longer mislabelled as duplicates; they propagate.
            raise DuplicateEntry("Duplicate Entry")


def _refresh_frontpage_counts(cursor):
    """Write each user's story count from ``digg`` to ``digg_users.frontpage``."""
    cursor.execute(
        """SELECT COUNT(*) num, user
           FROM digg GROUP BY user ORDER BY num DESC""")
    for row in cursor.fetchall():
        # The parameter must be a 1-tuple; a bare string is not a sequence of
        # query arguments.
        cursor.execute("""SELECT * FROM digg_users WHERE user=%s""",
                       (row['user'],))
        if cursor.fetchone() is None:
            cursor.execute(
                """INSERT INTO digg_users SET frontpage=%s, user=%s""",
                (row['num'], row['user']))
        else:
            cursor.execute(
                """UPDATE digg_users SET frontpage=%s WHERE user=%s""",
                (row['num'], row['user']))


def update_news(news_pages):
    """Scrape front pages 1..news_pages and refresh per-user story counts.

    Pages are fetched in order and every story is inserted into ``digg``.
    Scraping stops at the first duplicate story; the per-user counts in
    ``digg_users`` are then recomputed and the transaction is committed.

    Args:
        news_pages: number of front pages to fetch, starting at page 1.
    """
    dbconn = _connect()
    try:
        cursor = dbconn.cursor()
        try:
            try:
                for page in range(1, news_pages + 1):
                    _store_front_page(cursor, page)
            except DuplicateEntry:
                # A duplicate ends the crawl; the counts are still refreshed.
                pass
            _refresh_frontpage_counts(cursor)
        finally:
            cursor.close()
        dbconn.commit()
    finally:
        # Runs on failure too, so the connection is never leaked; nothing is
        # committed unless every step above succeeded.
        dbconn.close()


def _stat_text(em):
    """Return the text of a stats ``<em>`` element with commas removed."""
    return em.string.replace(",", "")


def update_profiles(num_profiles):
    """Refresh profile statistics for the top *num_profiles* users.

    Users are ranked by ``frontpage + frontpagestatic``. For each one the
    profile page is fetched and its stats and photo URL are written to
    ``digg_users``. A profile whose page cannot be parsed is reported and
    skipped; database errors propagate.

    Args:
        num_profiles: maximum number of user profiles to update.
    """
    dbconn = _connect()
    try:
        cursor = dbconn.cursor()
        try:
            # LIMIT is passed as a bound parameter rather than formatted into
            # the SQL text.
            cursor.execute(
                """SELECT user, frontpage + frontpagestatic AS popular
                   FROM digg_users
                   ORDER BY popular DESC
                   LIMIT %s""",
                (int(num_profiles),))
            rows = cursor.fetchall()
            for position, row in enumerate(rows, 1):
                user = row['user']
                print("Updating %s: %s" % (position, user))
                html = _fetch_html("http://digg.com/users/%s" % user)
                soup = BeautifulSoup(html)
                try:
                    stats = soup.find("ul", {"class": "stats"})
                    image = soup.find("img", {"class": "photo"})['src']
                    stats_em = stats.findAll("em")
                    dugg = _stat_text(stats_em[0])
                    submitted = _stat_text(stats_em[2])
                    popular = _stat_text(stats_em[3])
                    views = _stat_text(stats_em[6])
                except (AttributeError, TypeError, KeyError, IndexError):
                    # Missing element, attribute or stat: best-effort skip.
                    print("Skipping %s: profile page could not be parsed"
                          % user)
                    continue
                cursor.execute(
                    """UPDATE digg_users
                       SET frontpagetotal=%s, dugg=%s, profileViews=%s,
                           image=%s, submitted=%s
                       WHERE user=%s""",
                    (popular, dugg, views, image, submitted, user))
        finally:
            cursor.close()
        dbconn.commit()
    finally:
        dbconn.close()


if __name__ == "__main__":
    update_news(MAX_PAGES)
    update_profiles(MAX_PROFILES)