#!/usr/bin/env python3 # -*- coding: utf8 -*- """ Parse monster names and jp names for monster hunter X. http://monsterhunter.wikia.com/wiki/MHX:_Monsters Returns list of dict, e.g.: [ { "name": "Testucabra", "name_jp": "...", "title_jp": "..." }, ... ] """ import sys import re import json import lxml.etree import requests #

Lance

#Absolute Evasion
絶対回避 #The hunter's body spins and evades attacks while retreating from the immediate area. Your weapon will always be sheathed after this technique. SECTION_RE = re.compile('^(?:)?([^<]*)(?:)?') NAME_RE = re.compile( '^([^<]*)
(.*)') MONSTER_RE = re.compile( '(?:)?\s*' ']* title="([^"]*)"') # Old, MHX """ MONSTER_LINK_RE = re.compile( '') JAPANESE_NAME_STR = '

Japanese:

' JAPANESE_NAME_RE = re.compile( '

(.*)

') """ MONSTER_LINK_RE = re.compile( '([^<>]+)') """

ドスフロギィ
(Dosufurogi)

""" JAPANESE_NAME_RE = re.compile('

([^<>])
.

') JAPANESE_TITLE_RE = re.compile( '

([^<>]*)

def parse_wikia_monsters(f):
    """
    Collect monster links from the lines of file object `f`.

    A line matching SECTION_RE switches the current section to the
    pattern's first group. Links are only collected while the current
    section is "Large Monsters" or "Small Monsters". MONSTER_LINK_RE must
    expose two groups: group 1 is stored as "href", group 2 as "name".

    Returns a list of dict(href=..., name=...) in first-seen order, with
    at most one entry per name. Names starting with "File:" are skipped.
    """
    section = None
    data = []
    seen = set()
    for line in f:
        line = line.strip()
        m = SECTION_RE.match(line)
        if m:
            section = m.group(1)
            print("section", section, file=sys.stderr)
            continue
        if section not in ("Large Monsters", "Small Monsters"):
            continue
        for m in MONSTER_LINK_RE.finditer(line):
            monster = dict(href=m.group(1), name=m.group(2))
            if monster["name"].startswith("File:"):
                continue
            if monster["name"] not in seen:
                data.append(monster)
                seen.add(monster["name"])
    return data


def get_jp_names(monster_path, timeout=30):
    """
    Fetch a monster's wiki page and extract its Japanese name and title.

    `monster_path` is appended to the wiki base URL. `timeout` (seconds)
    is passed to requests.get so a stalled connection cannot hang the
    whole run.

    Returns a list:
      []             if the page has no Japanese name element
      [name]         if only the name was found
      [name, title]  if both were found
    """
    url = "http://monsterhunter.wikia.com" + monster_path
    r = requests.get(url, timeout=timeout)
    root = lxml.etree.HTML(r.text)
    names = []
    rbs = root.xpath('//h2[@data-source="Japanese Name"]//rb')
    if not rbs:
        # No name on the page: return the empty list the caller already
        # handles instead of failing on rbs[0]. The title is not returned
        # alone because the caller treats the first element as the name.
        return names
    names.append(rbs[0].text)
    divs = root.xpath('//div[@data-source="Japanese Title"]//div')
    if divs:
        names.append(divs[0].text)
    return names


def parse_japanese_name(div_contents):
    """
    Pick the Japanese name out of a one- or two-part string.

    With a single part, that part is returned. With two parts, the first
    is returned unless the second ends with "(4th Gen)", in which case the
    second is returned with that suffix removed. Any other number of parts
    fails the assertion.
    """
    # NOTE(review): in this copy of the file the delimiter literal is a
    # bare line break; it may originally have been an HTML line-break tag.
    # Confirm against the page markup before relying on this function.
    parts = div_contents.split("\n")
    if len(parts) == 1:
        return parts[0]
    assert len(parts) == 2
    # Remobra has different titles in 2nd and 4th gen, parse from
    # second part and remove the paren part
    if parts[1].endswith("(4th Gen)"):
        return parts[1][:-len("(4th Gen)")]
    return parts[0]


def _main():
    """
    Read the saved monster list page named by sys.argv[1], look up the
    Japanese name and title of every monster found, and print the result
    as JSON on stdout. Problems are reported on stderr.
    """
    with open(sys.argv[1]) as f:
        monster_list = parse_wikia_monsters(f)
    for m in monster_list:
        name = m["name"]
        names = get_jp_names(m["href"])
        if not names:
            print("ERROR: no names for %s" % name, file=sys.stderr)
            names = ["", ""]
        elif len(names) == 1:
            print("ERROR: no title for %s" % name, file=sys.stderr)
            names.append("")
        m["name_jp"] = names[0]
        m["title_jp"] = names[1]
        # The wiki uses these placeholders when a monster has no title.
        if m["title_jp"] in ("None", "N/A", "(?)"):
            m["title_jp"] = ""
    print(json.dumps(monster_list, indent=2))


if __name__ == '__main__':
    _main()

Lance

Japanese:

ドスフロギィ(Dosufurogi)

([^<>]*).*

ドスフロギィ
(Dosufurogi)

([^<>])
.