青空文庫API(アルファ版)を作ってみた
HTMLスクレイピングの勉強と自分の検索用をかねて、青空文庫DBを生成してみました。
とりあえずDB作っとけば何か面白いことがやれそうだなぁ…と思いつつも、今のところすぐにサービスインできそうなモノも作れていないので…とりあえずエイヤッとAPIとして公開してしまおうかなと(´・ω・`)
http://plasticscafe.sakura.ne.jp/aozora/
現状はJSONのみの提供で、データ形式とかもあまり深く考えていない適当仕様です。こういうデータが欲しいとか形式はこれが良い等のご希望があればお知らせください。あと不具合あったらご連絡を…(´・ω・`)…おねがいします。
実装とかそこらへん
実装はPythonで、HTMLのパースはBeautifulSoupを使っております。とりあえずまったく整理できていないコードはこんな感じです。青空文庫様が提供している全作品CSVのファイルを舐めて情報を取得していく感じですネ。あまり青空文庫側には負担をかけないように頑張っておりますが…どうだろう。頑張ってはいます、迷惑かけてたらごめんなさい直しますほんとごめんなさい(´・ω・`)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Build a local SQLite mirror of the Aozora Bunko catalogue.

Downloads the "all works" CSV zip, scrapes each work's card page for
download URLs and kana readings, stores everything in SQLite, then
scrapes the access-ranking pages and dumps JSON ranking caches.

Python 2 script (urllib2 / unicode / BeautifulSoup 3).
"""
import os
import re
import urllib2
from zipfile import ZipFile
import csv
import time
import shutil
import sqlite3
import json
from BeautifulSoup import BeautifulSoup

work_dir = os.path.dirname(os.path.abspath(__file__)) + '/'
aozora_url = 'http://www.aozora.gr.jp/'
url = ''
zip_url = aozora_url + 'index_pages/list_person_all.zip'
article_url = aozora_url + 'cards/'
user_agent = 'libestation'
tmp_dir = 'tmp/'
cache_dir = 'cache/'
zipfile = 'list_person_all.zip'
db_dir = 'db/'
db_file = 'aozora.db'
db_path = work_dir + db_dir + db_file
ranking_url = 'http://www.aozora.gr.jp/access_ranking/'
# Process at most this many NEW articles per run (keeps server load low).
process_num = 500

import sys
import codecs
sys.stdout = codecs.getwriter('utf_8')(sys.stdout)


class AozoraParser:

    def getZip(self):
        """Download the all-works list zip into the tmp directory.

        Raises IOError if the download fails (the original silently
        wrote ``None`` to disk in that case).
        """
        data = self.getHttp(zip_url)
        if data is None:
            raise IOError('could not download ' + zip_url)
        with open(work_dir + tmp_dir + zipfile, 'wb') as localfile:
            localfile.write(data)

    def unZip(self):
        """Extract the first member of the downloaded zip.

        Returns the extracted file name, or None if the zip is empty.
        """
        zf = ZipFile(work_dir + tmp_dir + zipfile, 'r')
        try:
            filenames = zf.namelist()
            if filenames:
                filename = filenames[0]
                # file() is deprecated; use open() and close deterministically.
                with open(work_dir + tmp_dir + filename, 'wb') as uzf:
                    uzf.write(zf.read(filename))
                return filename
            return None
        finally:
            zf.close()

    def parseCSV(self, csvfile):
        """Walk the all-works CSV and store artist/article rows in SQLite.

        Only articles not yet in the DB are scraped; at most
        ``process_num`` new articles are fetched per run.

        All SQL now uses ``?`` placeholders: the old string-concatenated
        statements broke on (and were injectable through) titles or
        names containing a single quote.
        """
        con = self.initDB()
        n = 0  # CSV row counter (row 0 is the header)
        m = 0  # newly scraped articles this run
        with open(work_dir + tmp_dir + csvfile) as cf:
            for row in csv.reader(cf):
                if n > 0:
                    # artist information
                    artist_id = row[0]
                    artist_name = unicode(row[1], 'shift-jis')
                    artist_url = (aozora_url + 'index_pages/person'
                                  + str(int(artist_id)) + '.html')
                    # article information
                    article_id = row[2]
                    article_name = unicode(row[3], 'shift-jis')
                    # NOTE: renamed from ``article_url`` style of the global;
                    # this is the card (detail) page for one work.
                    card_url = (aozora_url + 'cards/' + artist_id + '/card'
                                + str(int(article_id)) + '.html')
                    article_public_date = row[9]
                    print(artist_name + ':' + article_name + ':' + card_url)
                    cur = con.execute(
                        'SELECT id FROM articles WHERE id = ? LIMIT 1',
                        (article_id,))
                    if cur.fetchone() is None:
                        # Scrape the card page for download URLs / kana.
                        result = self.getArticleInfo(card_url)
                        if result is not None:
                            # local name must NOT shadow the module-level
                            # ``zip_url`` constant (it did in the original)
                            (article_zip_url, html_url, status,
                             article_kana, artist_kana) = result
                            print('get zip url:' + article_zip_url)
                        else:
                            # status 2 == card page could not be fetched/parsed
                            (article_zip_url, html_url, status,
                             article_kana, artist_kana) = ('', '', 2, '', '')
                        # save artist info (insert once; upgrade a previously
                        # failed (status 2) record with the fresh status)
                        row_artist = con.execute(
                            'SELECT id, status FROM artists '
                            'WHERE id = ? LIMIT 1', (artist_id,)).fetchone()
                        if row_artist is None:
                            con.execute(
                                'INSERT INTO artists '
                                '(id, name, kana, url, status) '
                                'VALUES (?, ?, ?, ?, ?)',
                                (artist_id, artist_name, artist_kana,
                                 artist_url, status))
                        elif row_artist[1] == 2:
                            con.execute(
                                'UPDATE artists SET status = ? WHERE id = ?',
                                (status, artist_id))
                        # save article info
                        con.execute(
                            'INSERT INTO articles '
                            '(id, name, kana, url, zip_url, html_url, '
                            ' status, public_date, artist_id) '
                            'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
                            (article_id, article_name, article_kana,
                             card_url, article_zip_url, html_url,
                             status, article_public_date, artist_id))
                        if process_num < m:
                            break
                        m += 1
                        con.commit()
                n += 1
        con.commit()
        con.close()

    def createRankingFile(self, category=None):
        """Write a JSON ranking cache file.

        category: 'html' or 'text' for the per-format ranking, or None
        for the combined (summed) ranking. Returns True.
        """
        sql = ('SELECT ac.id, ac.name, ast.id, ast.name, r.count, r.category '
               'FROM rankings AS r '
               'LEFT JOIN articles AS ac ON r.article_id = ac.id '
               'LEFT JOIN artists AS ast ON ast.id = ac.artist_id')
        if category == 'html':
            sql += ' WHERE category = 0 ORDER BY count DESC'
            file_name_postfix = '_' + category
        elif category == 'text':
            sql += ' WHERE category = 1 ORDER BY count DESC'
            file_name_postfix = '_' + category
        else:
            # combined ranking: sum html + text access counts per article
            sql = ('SELECT article_id, ac.name, ast.id, ast.name, '
                   'sum(count) AS c, 2 '
                   'FROM rankings AS r '
                   'LEFT JOIN articles AS ac ON r.article_id = ac.id '
                   'LEFT JOIN artists AS ast ON ast.id = ac.artist_id '
                   'GROUP BY r.article_id, ac.id, ac.name, ast.name '
                   'ORDER BY c DESC')
            file_name_postfix = ''
        con = sqlite3.connect(db_path)
        try:
            rows = con.execute(sql).fetchall()
        finally:
            con.close()
        # format records
        results = [{'article_id': row[0],
                    'article_name': row[1],
                    'artist_id': row[2],
                    'artist_name': row[3],
                    'count': row[4]
                    # 'category': row[5]
                    } for row in rows]
        file_name = 'ranking' + file_name_postfix
        with open(work_dir + cache_dir + file_name + '.json', 'w') as f:
            f.write(json.dumps(results))
        return True

    #################################
    # DB Setting
    #################################
    def initDB(self):
        """Back up and open the DB, rebuild the rankings table, ensure the
        artists/articles tables exist. Returns the open connection.
        """
        if os.path.exists(db_path):
            # keep a copy of the previous run's database
            shutil.copy(db_path, db_path + '.old')
        con = sqlite3.connect(db_path)
        # artists/articles persist across runs; only rankings is rebuilt
        con.execute('DROP TABLE IF EXISTS rankings')
        con.execute('CREATE TABLE IF NOT EXISTS artists ('
                    ' id TEXT, name TEXT, kana TEXT, url TEXT,'
                    ' status INTEGER)')
        con.execute('CREATE TABLE IF NOT EXISTS articles ('
                    ' id TEXT, name TEXT, kana TEXT, url TEXT,'
                    ' zip_url TEXT, html_url TEXT, artist_id TEXT,'
                    ' public_date DATE, status INTEGER)')
        con.execute('CREATE TABLE rankings ('
                    ' article_id TEXT, count INTEGER, category INTEGER)')
        con.execute('CREATE INDEX ranking_article_id'
                    ' ON rankings(article_id)')
        con.execute('CREATE INDEX ranking_count ON rankings(count)')
        con.execute('CREATE INDEX ranking_category ON rankings(category)')
        return con

    #################################
    # Scrape HTML Data
    #################################
    def getArticleInfo(self, url):
        """Scrape one card page.

        Returns (zip_url, html_url, status, article_kana, artist_kana)
        or None when the page cannot be fetched or lacks download links.
        status: 0 = free, 1 = copyrighted (page shows a copyright div).
        """
        time.sleep(3)  # be polite: throttle requests to aozora.gr.jp
        html = self.getHttp(url)
        if html is None:
            return None
        soup = BeautifulSoup(html)
        urls = soup.findAll('table', {'class': 'download'})[0].findAll('a')
        # kana readings live at fixed cell positions on the card page
        artist_kana = soup.findAll('table')[2].findAll('td')[5].contents[0]
        article_kana = soup.findAll('table')[0].findAll('td')[3].contents[0]
        status = 0
        if soup.find('div', {'class': 'copyright'}) is not None:
            status = 1
        if len(urls) >= 2:
            # hrefs start with './'; strip it so they join with the card dir
            return (urls[0]['href'][2:], urls[1]['href'][2:], status,
                    article_kana, artist_kana)
        return None

    # Access Ranking
    def parseAccessRanking(self):
        """Fetch the ranking index page and store both rankings
        (category 0 = html, 1 = text)."""
        html = self.getHttp(ranking_url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        urls = soup.findAll('a')
        con = sqlite3.connect(db_path)
        try:
            self.saveAccessRanking(ranking_url + urls[0]['href'], con, 0)
            self.saveAccessRanking(ranking_url + urls[1]['href'], con, 1)
        finally:
            con.close()

    def saveAccessRanking(self, url, db, category=0):
        """Parse one ranking table page and insert one rankings row per
        article already present in the articles table."""
        html = self.getHttp(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        trs = soup.findAll('tr')
        n = 0
        for tr in trs:
            if n > 0:  # skip the header row
                article_page = tr.find('a')['href']
                row = db.execute(
                    'SELECT id FROM articles WHERE url = ? LIMIT 1',
                    (article_page,)).fetchone()
                if row is not None:
                    access_count = tr.findAll('td')[3].contents[0]
                    db.execute(
                        'INSERT INTO rankings (article_id, count, category)'
                        ' VALUES (?, ?, ?)',
                        (row[0], int(access_count), category))
            n += 1
        db.commit()

    #################################
    # Utility
    #################################
    def getHttp(self, url):
        """GET ``url`` with our User-Agent; return the body, or None on
        any HTTP/network error (the original only caught HTTPError, so a
        plain connection failure killed the whole crawl)."""
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', user_agent)]
        try:
            http = opener.open(url)
            try:
                return http.read()
            finally:
                http.close()
        except urllib2.URLError:  # HTTPError is a subclass of URLError
            return None


####################################################
# Main Process
def main():
    ap = AozoraParser()
    ap.getZip()
    csvfile = ap.unZip()
    if csvfile is None:  # empty zip: nothing to do
        return
    ap.parseCSV(csvfile)
    ap.parseAccessRanking()
    time.sleep(120)  # let the server breathe before the cache rebuild
    ap.createRankingFile()
    ap.createRankingFile('text')
    ap.createRankingFile('html')


####################################################
# Main Setting
if __name__ == "__main__":
    main()
とりあえずさくらインターネットのサーバがメゲルので一定時間に500件ずつ処理するようになってます。それはともかく、結構無駄な感じの部分も多いので全体的に見直さないとイカンなぁ…精進します(´・ω・`)