You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
125 lines
3.5 KiB
125 lines
3.5 KiB
#!/usr/bin/env python
|
|
# -*- coding: utf8 -*-
|
|
"""
|
|
Parse monster names and jp names for monster hunter X.
|
|
http://monsterhunter.wikia.com/wiki/MHX:_Monsters
|
|
|
|
Returns list of dict, e.g.:
|
|
[
|
|
{
|
|
"name": "Testucabra",
|
|
"name_jp": "...",
|
|
"title_jp": "..."
|
|
},
|
|
...
|
|
]
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
import json
|
|
from collections import defaultdict, OrderedDict
|
|
|
|
import requests
|
|
|
|
|
|
#<h3><span class="mw-headline" id="Lance">Lance</span></h3>
|
|
#<td style="vertical-align: top; background-color: #ddeeee; font-size:12pt;">Absolute Evasion<br />絶対回避
|
|
#</td><td>The hunter's body spins and evades attacks while retreating from the immediate area. Your weapon will always be sheathed after this technique.
|
|
SECTION_RE = re.compile('^<h[23]><span class="mw-headline" id="[^"]*">(?:<b>)?([^<]*)(?:</b>)?</span></h[23]>')
|
|
NAME_RE = re.compile(
|
|
'^<td style="vertical-align: top; background-color: #ddeeee; font-size:12pt;">([^<]*)<br />(.*)')
|
|
|
|
|
|
MONSTER_RE = re.compile(
|
|
'(?:</td>)?<td style="[^"]*background-color:#EBEBEB;">\s*'
|
|
'<a href="([^"]*)" [^>]* title="([^"]*)"')
|
|
|
|
|
|
JAPANESE_NAME_STR = '<h3 class="pi-data-label pi-secondary-font">Japanese:</h3>'
|
|
JAPANESE_NAME_RE = re.compile(
|
|
'<div class="pi-data-value pi-font">(.*)</div>')
|
|
|
|
|
|
def parse_wikia_monsters(f):
|
|
section = None
|
|
data = []
|
|
seen = set()
|
|
while True:
|
|
line = f.readline()
|
|
if not line:
|
|
break
|
|
line = line.strip()
|
|
m = SECTION_RE.match(line)
|
|
if m:
|
|
section = m.group(1)
|
|
print >>sys.stderr, "section", section
|
|
continue
|
|
if section != "Large Monsters":
|
|
continue
|
|
m = MONSTER_RE.search(line)
|
|
if m:
|
|
monster = dict(href=m.group(1), name=m.group(2))
|
|
if monster["name"].startswith("File:"):
|
|
continue
|
|
if monster["name"] not in seen:
|
|
data.append(monster)
|
|
seen.add(monster["name"])
|
|
return data
|
|
|
|
|
|
def get_jp_names(monster_path):
|
|
url = "http://monsterhunter.wikia.com" + monster_path
|
|
r = requests.get(url)
|
|
html = r.text
|
|
lines = r.text.split("\n")
|
|
names = []
|
|
while lines:
|
|
line = lines.pop(0).strip()
|
|
if JAPANESE_NAME_STR not in line:
|
|
continue
|
|
line = lines.pop(0).strip()
|
|
while line == "":
|
|
line = lines.pop(0).strip()
|
|
m = JAPANESE_NAME_RE.match(line)
|
|
assert m, "No match: " + line
|
|
names.append(parse_japanese_name(m.group(1)))
|
|
if len(names) == 2:
|
|
break
|
|
return names
|
|
|
|
|
|
def parse_japanese_name(div_contents):
|
|
parts = div_contents.split("<br />")
|
|
if len(parts) == 1:
|
|
return parts[0]
|
|
assert len(parts) == 2
|
|
# Remobra has different titles in 2nd and 4th gen, parse from
|
|
# second part and remove the paren part
|
|
if parts[1].endswith("(4th Gen)"):
|
|
return parts[1][:-len("(4th Gen)")]
|
|
return parts[0]
|
|
|
|
|
|
def _main():
|
|
with open(sys.argv[1]) as f:
|
|
monster_list = parse_wikia_monsters(f)
|
|
for m in monster_list:
|
|
name = m["name"]
|
|
names = get_jp_names(m["href"])
|
|
if len(names) == 0:
|
|
print >>sys.stderr, "ERROR: no names for %s" % name
|
|
names = ["", ""]
|
|
if len(names) == 1:
|
|
print >>sys.stderr, "ERROR: no title for %s" % name
|
|
names.append("")
|
|
m["name_jp"] = names[0]
|
|
m["title_jp"] = names[1]
|
|
if m["title_jp"] in ("None", "N/A", "(?)"):
|
|
m["title_jp"] = ""
|
|
print json.dumps(monster_list, indent=2)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
_main()
|