#!/usr/bin/env python3
# -*- coding: utf8 -*-
"""
Parse monster names and Japanese names for Monster Hunter X from a saved
copy of the wiki monster list page:
http://monsterhunter.wikia.com/wiki/MHX:_Monsters

Prints a JSON list of dicts to stdout, e.g.:
[
  {
    "href": "/wiki/...",
    "name": "Testucabra",
    "name_jp": "...",
    "title_jp": "..."
  },
  ...
]
"""
|
|
|
|
import sys
|
|
import re
|
|
import json
|
|
import lxml.etree
|
|
|
|
import requests
|
|
|
|
|
|
#<h3><span class="mw-headline" id="Lance">Lance</span></h3>
|
|
#<td style="vertical-align: top; background-color: #ddeeee; font-size:12pt;">Absolute Evasion<br />絶対回避
|
|
#</td><td>The hunter's body spins and evades attacks while retreating from the immediate area. Your weapon will always be sheathed after this technique.
|
|
SECTION_RE = re.compile('^<h[23]><span class="mw-headline" id="[^"]*">(?:<b>)?([^<]*)(?:</b>)?</span></h[23]>')
|
|
NAME_RE = re.compile(
|
|
'^<td style="vertical-align: top; background-color: #ddeeee; font-size:12pt;">([^<]*)<br />(.*)')
|
|
|
|
|
|
MONSTER_RE = re.compile(
|
|
'(?:</td>)?<td style="[^"]*background-color:#EBEBEB;[^"]*">\s*'
|
|
'<a href="([^"]*)" [^>]* title="([^"]*)"')
|
|
|
|
# Old, MHX
|
|
"""
|
|
MONSTER_LINK_RE = re.compile(
|
|
'<a href="(/wiki/[^/"]*)"\s+class="image image-thumbnail link-internal"\s+'
|
|
'title="([^"]*)"\s+>')
|
|
|
|
|
|
JAPANESE_NAME_STR = '<h3 class="pi-data-label pi-secondary-font">Japanese:</h3>'
|
|
JAPANESE_NAME_RE = re.compile(
|
|
'<div class="pi-data-value pi-font">(.*)</div>')
|
|
"""
|
|
|
|
MONSTER_LINK_RE = re.compile(
|
|
'<a href="(/wiki/[^/"]*)" title="([^"]*)">([^<>]+)</a>')
|
|
|
|
"""
|
|
<h2 class="pi-item pi-item-spacing pi-title" data-source="Japanese Name"><ruby lang="ja"><rb lang="ja-Hani">ドスフロギィ<br/>(Dosufurogi)</rb></ruby></h2>
|
|
"""
|
|
JAPANESE_NAME_RE = re.compile('<h2 class="pi-item pi-item-spacing pi-title" data-source="Japanese Name"><ruby lang="ja"><rb lang="[^"]*">([^<>]*)<br/>.*</rb></ruby></h2>')
|
|
JAPANESE_TITLE_RE = re.compile(
|
|
'<div class="pi-data-value pi-font">([^<>]*)</div>')
|
|
|
|
def parse_wikia_monsters(f):
    """
    Collect monster links from the HTML of the wiki monster list page.

    The input is scanned line by line. Each line matching SECTION_RE updates
    the current section; links matched by MONSTER_LINK_RE are collected only
    while the current section is "Large Monsters" or "Small Monsters".

    f -- iterable of text lines, e.g. an open text file.

    Returns a list of dicts with the keys "href" (link target) and "name"
    (link title), in order of first appearance, one entry per distinct name.
    """
    wanted_sections = ("Large Monsters", "Small Monsters")
    section = None
    data = []
    seen = set()
    for line in f:
        line = line.strip()
        heading = SECTION_RE.match(line)
        if heading:
            section = heading.group(1)
            # Diagnostic output, written to stderr.
            print("section", section, file=sys.stderr)
            continue
        if section not in wanted_sections:
            continue
        for link in MONSTER_LINK_RE.finditer(line):
            name = link.group(2)
            # Skip titles in the "File:" namespace (presumably image links)
            # and names that were already collected.
            if name.startswith("File:") or name in seen:
                continue
            seen.add(name)
            data.append(dict(href=link.group(1), name=name))
    return data
|
|
|
|
|
|
def get_jp_names(monster_path):
    """
    Fetch a monster's wiki page and extract its Japanese name and title.

    monster_path -- site-relative path of the page; it is appended to
                    "http://monsterhunter.wikia.com".

    Returns a list of strings:
      []             when the page has no Japanese-name element,
      [name]         when only the name is present,
      [name, title]  when both are present.
    """
    url = "http://monsterhunter.wikia.com" + monster_path
    r = requests.get(url)
    root = lxml.etree.HTML(r.text)

    rbs = root.xpath('//h2[@data-source="Japanese Name"]//rb')
    if not rbs:
        # Report "no names" to the caller instead of failing with an
        # IndexError on rbs[0]; _main() handles an empty list explicitly.
        return []
    names = [rbs[0].text]

    divs = root.xpath('//div[@data-source="Japanese Title"]//div')
    if divs:
        names.append(divs[0].text)

    return names
|
|
|
|
|
|
def parse_japanese_name(div_contents):
    """
    Pick one value out of a string that may hold two "<br />"-separated
    alternatives.

    div_contents -- string to parse (presumably the inner HTML of a value
                    div; verify against callers).

    Returns div_contents unchanged when it contains no "<br />". With two
    parts, returns the first part, unless the second part ends with
    "(4th Gen)"; then the second part is returned with that marker and any
    whitespace before it removed.

    Raises AssertionError when there are more than two parts.
    """
    parts = div_contents.split("<br />")
    if len(parts) == 1:
        return parts[0]
    if len(parts) != 2:
        # Raised explicitly rather than via an `assert` statement so the
        # check is not stripped under `python -O`; the exception type is
        # the one the assert used to raise.
        raise AssertionError(
            "expected at most one '<br />' in %r" % (div_contents,))
    # Remobra has different titles in 2nd and 4th gen, parse from
    # second part and remove the paren part
    marker = "(4th Gen)"
    if parts[1].endswith(marker):
        # rstrip() drops the whitespace that separated the value from the
        # marker, which would otherwise be left dangling.
        return parts[1][:-len(marker)].rstrip()
    return parts[0]
|
|
|
|
|
|
def _main():
    """
    Command-line entry point.

    Reads the monster list HTML file named by the first command-line
    argument, looks up the Japanese name and title of every monster found
    in it via get_jp_names(), and prints the resulting list of dicts as
    JSON on stdout. Problems with individual monsters are reported on
    stderr and the missing values are left as empty strings.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("Usage: %s MONSTER_LIST_HTML" % sys.argv[0])

    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(sys.argv[1], encoding="utf-8") as f:
        monster_list = parse_wikia_monsters(f)

    for m in monster_list:
        name = m["name"]
        names = get_jp_names(m["href"])
        if not names:
            print("ERROR: no names for %s" % name, file=sys.stderr)
            names = ["", ""]
        if len(names) == 1:
            print("ERROR: no title for %s" % name, file=sys.stderr)
            names.append("")
        m["name_jp"] = names[0]
        m["title_jp"] = names[1]
        # Placeholder texts that stand for "no title" are normalized to "".
        if m["title_jp"] in ("None", "N/A", "(?)"):
            m["title_jp"] = ""

    print(json.dumps(monster_list, indent=2))
|
|
|
|
|
|
# Run the command-line entry point only when executed as a script, not
# when the module is imported.
if __name__ == '__main__':
    _main()
|