I built an Aozora Bunko API (alpha)

Partly to practice HTML scraping and partly for my own searching needs, I generated a database of Aozora Bunko works.

I figured that once I had the DB around I could probably do something interesting with it, but since I haven't managed to build anything that's ready to launch as a service yet... I decided to just go ahead and publish it as an API for now (´・ω・`)

http://plasticscafe.sakura.ne.jp/aozora/

At the moment it only serves JSON, and the output format is a rough spec I haven't thought through very deeply. If there's data you'd like it to return, or a format you'd prefer, please let me know. And if you run into any bugs, please... do get in touch (´・ω・`)
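
By the way, each entry in the ranking JSON generated by the script shown below carries article_id, article_name, artist_id, artist_name, and count. As a minimal client sketch, assuming the ranking file ends up being served at a path like ranking.json under the URL above (the actual endpoints and field layout may still change), fetching it would look roughly like this:

import urllib2
import json

api_base = 'http://plasticscafe.sakura.ne.jp/aozora/'
# 'ranking.json' is an assumed path -- check the page above for the real endpoints
response = urllib2.urlopen(api_base + 'ranking.json')
ranking = json.loads(response.read())
# print the top five entries (title / author : access count)
for entry in ranking[:5]:
  line = u'%s / %s : %s' % (entry['article_name'], entry['artist_name'], entry['count'])
  print(line.encode('utf-8'))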

Implementation and so on

The implementation is in Python, with BeautifulSoup for the HTML parsing. The code, still completely unpolished, looks like this. It walks through the all-works CSV file that Aozora Bunko provides and fetches the details for each entry. I'm doing my best not to put much load on the Aozora Bunko side... I hope. I really am trying; if it's causing any trouble, I'm sorry, I'll fix it, really sorry (´・ω・`)

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import urllib2
from zipfile import ZipFile
import csv
import time
import shutil
import sqlite3
import json
from BeautifulSoup import BeautifulSoup

# Paths and constants
work_dir = os.path.dirname(os.path.abspath(__file__)) + '/'
aozora_url = 'http://www.aozora.gr.jp/'
zip_url = aozora_url + 'index_pages/list_person_all.zip'
ranking_url = 'http://www.aozora.gr.jp/access_ranking/'
user_agent = 'libestation'
tmp_dir = 'tmp/'
cache_dir = 'cache/'
zipfile = 'list_person_all.zip'
db_dir = 'db/'
db_file = 'aozora.db'
db_path = work_dir + db_dir + db_file
process_num = 500  # max number of new cards fetched per run (to keep load down)

# Wrap stdout so that printing unicode strings outputs UTF-8 (Python 2)
sys.stdout = codecs.getwriter('utf_8')(sys.stdout)

class AozoraParser:
  # download the "all persons / works" list zip from Aozora Bunko
  def getZip(self):
    zip_data = self.getHttp(zip_url)
    if zip_data is None:
      return
    localfile = open(work_dir + tmp_dir + zipfile, 'wb')
    localfile.write(zip_data)
    localfile.close()

  # extract the CSV from the downloaded zip and return its filename
  def unZip(self):
    zf = ZipFile(work_dir + tmp_dir + zipfile, 'r')
    filenames = zf.namelist()
    if 0 < len(filenames):
      filename = filenames[0]
      uzf = open(work_dir + tmp_dir + filename, 'wb')
      uzf.write(zf.read(filename))
      uzf.close()
      return filename
    return None
  
  # Walk the all-works CSV from Aozora Bunko and store artist / article rows in SQLite
  def parseCSV(self, csvfile):
    cf = open(work_dir + tmp_dir + csvfile)
    n = 0  # row counter (row 0 is the CSV header)
    m = 0  # number of new cards processed in this run
    con = self.initDB()
    for row in csv.reader(cf):
      if 0 < n:
        # artist information
        artist_id = row[0]
        artist_name = unicode(row[1], 'shift-jis')
        artist_url = aozora_url + 'index_pages/person' + str(int(artist_id)) + '.html'
        # article information
        article_id = row[2]
        article_name = unicode(row[3], 'shift-jis')
        article_url = aozora_url + 'cards/' + artist_id + '/card' + str(int(article_id)) + '.html'
        article_public_date = row[9]
        print(artist_name + ':' + article_name + ':' + article_url)

        rows = con.execute('SELECT id FROM articles WHERE id = ? LIMIT 1', (article_id,))
        if rows.fetchone() is None:
          # scrape the card page for download URLs, kana readings and copyright status
          result = self.getArticleInfo(article_url)
          if result is not None:
            (zip_url, html_url, status, article_kana, artist_kana) = result
            print('get zip url:' + zip_url)
          else:
            (zip_url, html_url, status, article_kana, artist_kana) = ('', '', 2, '', '')
          # save artist info (insert once, or fill in a previously failed row)
          rows = con.execute('SELECT id, status FROM artists WHERE id = ? LIMIT 1', (artist_id,))
          row_artist = rows.fetchone()
          if row_artist is None:
            con.execute(
              'INSERT INTO artists (id, name, kana, url, status) VALUES (?, ?, ?, ?, ?)',
              (artist_id, artist_name, artist_kana, artist_url, status))
          elif row_artist[1] == 2:
            con.execute('UPDATE artists SET status = ? WHERE id = ?', (status, artist_id))
          # save article info
          con.execute(
            """INSERT INTO articles
            (id, name, kana, url, zip_url, html_url, status, public_date, artist_id)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (article_id, article_name, article_kana, article_url,
             zip_url, html_url, status, article_public_date, artist_id))
          con.commit()
          m += 1
          # stop after process_num new cards so one run stays light on the server
          if process_num <= m:
            break
      n += 1
    con.close()
  # Dump the access ranking to a JSON cache file; category is 'html', 'text',
  # or None for the combined (summed) ranking
  def createRankingFile(self, category=None):
    sql = """
    SELECT ac.id, ac.name, ast.id, ast.name, r.count, r.category
    FROM rankings AS r 
    LEFT JOIN articles AS ac ON r.article_id = ac.id 
    LEFT JOIN artists AS ast ON ast.id = ac.artist_id
    """ 
    if category == 'html':
      search_sql = ' WHERE category = 0 ORDER BY count DESC'
      sql = sql + search_sql
      file_name_postfix = '_' + category
    elif category == 'text':
      search_sql = ' WHERE category = 1 ORDER BY count DESC'
      sql = sql + search_sql
      file_name_postfix = '_' + category
    else:
      sql = """
      SELECT article_id, ac.name, ast.id, ast.name, sum(count) AS c, 2
      FROM rankings AS r
      LEFT JOIN articles AS ac ON r.article_id = ac.id 
      LEFT JOIN artists AS ast ON ast.id = ac.artist_id
      GROUP BY r.article_id, ac.id, ac.name, ast.name
      ORDER BY c DESC
      """
      file_name_postfix = ''
    con = sqlite3.connect(db_path)
    rows = con.execute(sql).fetchall() 
    con.close()
    # format records
    results = []
    for row in rows:
      results.append({
        'article_id':row[0],
        'article_name':row[1],
        'artist_id':row[2],
        'artist_name':row[3],
        'count':row[4]
        #'category':row[5]
      })
    file_name = 'ranking' + file_name_postfix
    f = open(work_dir + cache_dir + file_name + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return True
  #################################
  # DB Setting
  #################################
  def initDB(self):
    if os.path.exists(db_path):
      shutil.copy(db_path, db_path + '.old')
    con = sqlite3.connect(db_path)
    # Only the rankings table is rebuilt on each run; artists and articles are
    # kept and filled incrementally (their DROP statements stay commented out)
    sql = """
    DROP TABLE IF EXISTS artists
    """
    #con.execute(sql)
    sql = """
    DROP TABLE IF EXISTS articles
    """
    #con.execute(sql)
    sql = """
    DROP TABLE IF EXISTS rankings
    """
    con.execute(sql)
    # Create New Table
    sql = """
    CREATE TABLE IF NOT EXISTS artists (
      id TEXT,
      name TEXT,
      kana TEXT,
      url TEXT,
      status INTEGER
    )
    """
    con.execute(sql)
    sql = """
    CREATE TABLE IF NOT EXISTS articles (
      id TEXT,
      name TEXT,
      kana TEXT,
      url TEXT,
      zip_url TEXT,
      html_url TEXT,
      artist_id TEXT,
      public_date DATE,
      status INTEGER
    )
    """
    con.execute(sql)

    sql = """
    CREATE TABLE rankings (
      article_id TEXT,
      count INTEGER,
      category INTEGER
    )
    """
    con.execute(sql)
    sql = """
    create index ranking_article_id ON rankings(article_id)
    """
    con.execute(sql)
    sql = """
    create index ranking_count ON rankings(count)
    """
    con.execute(sql)
    sql = """
    create index ranking_category ON rankings(category)
    """
    con.execute(sql)
    return con    
  #################################
  # Scrape HTML Data
  #################################
  def getArticleInfo(self, url):
    # wait a bit between requests so we do not hammer the Aozora Bunko server
    time.sleep(3)
    html = self.getHttp(url)
    if html is not None:
      soup = BeautifulSoup(html)
      # the "download" table holds the zip / XHTML file links
      urls = soup.findAll('table', {'class':'download'})[0].findAll('a')
      artist_kana = soup.findAll('table')[2].findAll('td')[5].contents[0]
      article_kana = soup.findAll('table')[0].findAll('td')[3].contents[0]
      # status: 0 = no copyright notice, 1 = copyright notice present
      # (the caller sets 2 when the card page could not be fetched at all)
      status = 0
      if soup.find('div', {'class':'copyright'}) is not None:
        status = 1
      if 2 <= len(urls):
        # hrefs on the card page are relative ("./..."), so strip the leading "./"
        return (urls[0]['href'][2:], urls[1]['href'][2:], status, article_kana, artist_kana)
    return None
  # Access ranking: the ranking index page links to the XHTML ranking first,
  # then the plain-text ranking
  def parseAccessRanking(self):
    html = self.getHttp(ranking_url)
    if html is not None:
      soup = BeautifulSoup(html)
      urls = soup.findAll('a')
      con = sqlite3.connect(db_path)
      html_rank_url = ranking_url + urls[0]['href']
      self.saveAccessRanking(html_rank_url, con, 0)
      txt_rank_url = ranking_url + urls[1]['href']
      self.saveAccessRanking(txt_rank_url, con, 1)
      con.close()
  def saveAccessRanking(self, url, db, category=0):
    # category: 0 = XHTML ranking, 1 = plain-text ranking
    html = self.getHttp(url)
    if html is not None:
      soup = BeautifulSoup(html)
      trs = soup.findAll('tr')
      n = 0
      for tr in trs:
        if 0 < n:  # skip the table header row
          article_url = tr.find('a')['href']
          rows = db.execute('SELECT id FROM articles WHERE url = ? LIMIT 1', (article_url,))
          row = rows.fetchone()
          if row is not None:
            access_count = tr.findAll('td')[3].contents[0]
            db.execute(
              'INSERT INTO rankings (article_id, count, category) VALUES (?, ?, ?)',
              (row[0], access_count, category))
        n += 1
      db.commit()
  #################################
  # Utility
  #################################
  # HTTP getter with a custom User-Agent; returns None on HTTP / network errors
  def getHttp(self, url):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', user_agent)]
    try:
      http = opener.open(url)
      result = http.read()
      http.close()
      return result
    except urllib2.URLError:
      return None
####################################################
# Main Process
def main():
  ap = AozoraParser()
  # 1. download and unpack the all-works CSV
  ap.getZip()
  csvfile = ap.unZip()
  if csvfile is None:
    return
  # 2. crawl up to process_num new card pages into SQLite
  ap.parseCSV(csvfile)
  # 3. scrape the access ranking, then dump the cached JSON files
  ap.parseAccessRanking()
  time.sleep(120)
  ap.createRankingFile()
  ap.createRankingFile('text')
  ap.createRankingFile('html')
####################################################
# Main Setting
if __name__ == "__main__":
  main()
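
If you only want to regenerate the cached ranking JSON from data already in the SQLite DB, without crawling anything, you can also drive the class directly. A small usage sketch, assuming the listing above is saved as aozora_parser.py (that file name is just a placeholder):

from aozora_parser import AozoraParser  # assumed module name for the script above

ap = AozoraParser()
ap.createRankingFile()        # combined ranking    -> cache/ranking.json
ap.createRankingFile('html')  # XHTML ranking       -> cache/ranking_html.json
ap.createRankingFile('text')  # plain-text ranking  -> cache/ranking_text.json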

For now it processes 500 entries per run, because otherwise my Sakura Internet server gives out. That aside, there are still quite a few wasteful parts, so I need to go over the whole thing again... I'll keep working on it (´・ω・`)