]* title="([^"]*)"')
# Marker text that get_jp_names() looks for (substring test) in each stripped
# line of a fetched monster page.
# NOTE(review): as it appears here the literal is split across a line break,
# which a single-quoted string cannot contain -- part of the original text
# may have been lost; confirm against the original file.
JAPANESE_NAME_STR = 'Japanese:
'
# Pattern get_jp_names() applies with .match() to the first non-blank line
# following the marker; group 1 is what gets recorded as a name.
# NOTE(review): only the capture group is visible here; any surrounding text
# that belonged to the pattern may have been lost -- TODO confirm.
JAPANESE_NAME_RE = re.compile(
'([^<]*)')
def parse_wikia_monsters(f):
    """Collect the unique monster links found in the "Large Monsters" section.

    The input is consumed line by line.  A line matching ``SECTION_RE``
    changes the current section (its first group) and is reported on stderr.
    While the current section is exactly ``"Large Monsters"``, each line is
    searched with ``MONSTER_RE``; group 1 becomes ``href`` and group 2 becomes
    ``name``.  Only the first occurrence of each name is kept.

    Args:
        f: An iterable of text lines, such as an open text file.

    Returns:
        A list of ``{"href": ..., "name": ...}`` dicts in first-seen order.
    """
    current_section = None
    monsters = []
    seen_names = set()

    for raw_line in f:
        line = raw_line.strip()

        section_match = SECTION_RE.match(line)
        if section_match:
            current_section = section_match.group(1)
            # sys.stderr.write behaves the same on Python 2 and 3, unlike the
            # Python-2-only ``print >>sys.stderr`` statement.
            sys.stderr.write("section %s\n" % current_section)
            continue

        if current_section != "Large Monsters":
            continue

        monster_match = MONSTER_RE.search(line)
        if not monster_match:
            continue

        name = monster_match.group(2)
        if name in seen_names:
            # The same monster can be linked more than once; keep the first.
            continue
        seen_names.add(name)
        monsters.append({"href": monster_match.group(1), "name": name})

    return monsters
def get_jp_names(monster_path):
    """Download a monster's wiki page and scrape up to two names from it.

    The page at ``"http://monsterhunter.wikia.com" + monster_path`` is
    fetched with ``requests`` and scanned line by line.  Whenever a stripped
    line contains ``JAPANESE_NAME_STR``, the next non-blank line is matched
    against ``JAPANESE_NAME_RE`` and its first group is recorded.  Scanning
    stops once two names have been collected or the page is exhausted.

    Args:
        monster_path: Path appended to the wiki base URL.

    Returns:
        A list holding zero, one or two captured strings, in page order.

    Raises:
        AssertionError: If the non-blank line after a marker does not match
            ``JAPANESE_NAME_RE``.  Errors raised by ``requests.get`` are not
            caught here and propagate to the caller.
    """
    url = "http://monsterhunter.wikia.com" + monster_path
    response = requests.get(url)

    # A single shared iterator lets the inner loop consume the lines that
    # follow a marker without the quadratic cost of repeated list.pop(0).
    lines = iter(response.text.split("\n"))
    names = []

    for line in lines:
        if JAPANESE_NAME_STR not in line.strip():
            continue

        # The value sits on the first non-blank line after the marker.
        value = ""
        for following in lines:
            value = following.strip()
            if value:
                break
        if not value:
            # The page ended right after the marker: return what we have
            # instead of failing with IndexError.
            break

        match = JAPANESE_NAME_RE.match(value)
        if match is None:
            # Raised explicitly (rather than via ``assert``) so the check
            # still runs under ``python -O``; the exception type is unchanged.
            raise AssertionError("No match: " + value)

        names.append(match.group(1))
        if len(names) == 2:
            break

    return names
def _main():
    """Script entry point.

    Reads the file named by the first command-line argument with
    ``parse_wikia_monsters``, looks up each monster's names with
    ``get_jp_names``, stores them under ``name_jp`` and ``title_jp``
    (``"(?)"`` stands in for anything missing), and writes the resulting
    list to stdout as indented JSON.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: %s INPUT_FILE" % sys.argv[0])

    with open(sys.argv[1]) as f:
        monster_list = parse_wikia_monsters(f)

    for monster in monster_list:
        names = get_jp_names(monster["href"])
        if not names:
            sys.stderr.write("ERROR: no names for %s\n" % monster["name"])

        # Pad with placeholders so both fields always receive a value.
        padded = (list(names) + ["(?)", "(?)"])[:2]
        monster["name_jp"] = padded[0]
        monster["title_jp"] = padded[1]

    # write() works on Python 2 and 3; the ``print`` statement does not.
    sys.stdout.write(json.dumps(monster_list, indent=2) + "\n")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    _main()