diff --git a/db/mhx/locations.json b/db/mhx/locations.json
new file mode 100644
index 0000000..0d6af2e
--- /dev/null
+++ b/db/mhx/locations.json
@@ -0,0 +1,90 @@
+[
+{
+  "name": "Forest and Hills",
+  "name_jp": "森丘"
+},
+{
+  "name": "Swamp",
+  "name_jp": "沼地"
+},
+{
+  "name": "Snowy Mountains",
+  "name_jp": "雪山"
+},
+{
+  "name": "Volcano",
+  "name_jp": "火山"
+},
+{
+  "name": "Deserted Island",
+  "name_jp": "孤島"
+},
+{
+  "name": "Misty Peaks",
+  "name_jp": "渓流"
+},
+{
+  "name": "Sacred Mountain",
+  "name_jp": "霊峰"
+},
+{
+  "name": "Ancestral Steppe",
+  "name_jp": "遺跡平原"
+},
+{
+  "name": "Sunken Hollow",
+  "name_jp": "地下洞窟"
+},
+{
+  "name": "Primal Forest",
+  "name_jp": "原生林"
+},
+{
+  "name": "Frozen Seaway",
+  "name_jp": "氷海"
+},
+{
+  "name": "Volcanic Hollow",
+  "name_jp": "地底火山"
+},
+{
+  "name": "Sanctuary",
+  "name_jp": "禁足地"
+},
+{
+  "name": "Dunes",
+  "name_jp": "旧砂漠"
+},
+{
+  "name": "Ancient Forest",
+  "name_jp": "古代森"
+},
+{
+  "name": "Wyvern Graveyard",
+  "name_jp": "竜ノ墓場"
+},
+{
+  "name": "Tower 3",
+  "name_jp": "塔の秘竟"
+},
+{
+  "name": "Ingle Isle",
+  "name_jp": "溶岩島"
+},
+{
+  "name": "Polar Field",
+  "name_jp": "極圏"
+},
+{
+  "name": "Arena",
+  "name_jp": "闘技場"
+},
+{
+  "name": "Slayground",
+  "name_jp": "立体闘技場"
+},
+{
+  "name": "Moat Arena",
+  "name_jp": "水上闘技場"
+}
+]
diff --git a/scrapers/wikia-palico-skills.py b/scrapers/wikia-palico-skills.py
new file mode 100755
index 0000000..7a58779
--- /dev/null
+++ b/scrapers/wikia-palico-skills.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python2
+# vim: set fileencoding=utf8 :
+"""
+Scrape the MHX palico arts and skills tables from the Monster Hunter wikia.
+
+Downloads the page into the project's tmp directory, parses its "linetable"
+tables and prints the extracted arts and skills as two JSON lists.
+"""
+
+import urllib
+import os
+import json
+import sys
+
+from lxml import etree
+
+import _pathfix
+
+_BASE_URL = "http://monsterhunter.wikia.com/wiki/"
+
+_PAGE = "MHX:_Palico_Skills"
+
+# Each occurrence of this glyph in a skill's cost column adds one to its cost.
+_CIRCLE = u"\u26ab"
+
+
+def extract_arts_and_skills(tree):
+    """
+    Collect palico arts and skills from the page's "linetable" tables.
+
+    A header row with a single cell sets the category for the rows below
+    it; a header row with several cells names the table's columns, and
+    the first column name ("art_..." or "skill_...") selects how the data
+    rows are decoded.
+
+    Returns (arts, skills), two lists of dicts.
+    Raises ValueError for a data row that precedes any column header or
+    that sits in a table of unknown type.
+    """
+    arts = []
+    skills = []
+    tables = tree.xpath(
+        '//*[@id="mw-content-text"]/table[contains(@class, "linetable")]'
+    )
+    for table in tables:
+        category = None
+        fields = None
+        for row in table:
+            cols, is_header = _get_column_cells_texts(row)
+            if not cols:
+                # Row without th/td cells: nothing to decode.
+                continue
+            if is_header:
+                if len(cols) == 1:
+                    category = cols[0]
+                else:
+                    fields = [_header_to_field_name(c) for c in cols]
+            elif fields is None:
+                raise ValueError("Data row before column header: %r" % cols)
+            elif fields[0].startswith("art_"):
+                arts.append(_art_values(cols, category))
+            elif fields[0].startswith("skill_"):
+                skills.append(_skill_values(cols, category))
+            else:
+                raise ValueError("Unknown table type: %r" % fields[0])
+    return arts, skills
+
+
+def _art_values(cols, category):
+    """Build one art dict; the column layout depends on the category."""
+    if category == "Forte Specific (Unteachable)":
+        return dict(name=cols[0],
+                    name_jp=cols[1],
+                    forte=cols[2],
+                    cost=int(cols[3]),
+                    unlock_requirement=None,
+                    teaching_requirement=None,
+                    description=cols[4],
+                    teachable=False)
+    if category == "Forte Specific (Teachable)":
+        return dict(name=cols[0],
+                    name_jp=cols[1],
+                    forte="%s %s" % (cols[2], cols[3]),
+                    cost=int(cols[4]),
+                    unlock_requirement=None,
+                    teaching_requirement=cols[5],
+                    description=cols[6],
+                    teachable=True)
+    # Every other category is treated as available to all fortes.
+    return dict(name=cols[0],
+                name_jp=cols[1],
+                forte="All",
+                cost=int(cols[2]),
+                unlock_requirement=cols[3],
+                teaching_requirement=None,
+                description=cols[4],
+                teachable=True)
+
+
+def _skill_values(cols, category):
+    """Build one skill dict; cost is the number of circles in column 3."""
+    return dict(name=cols[0],
+                name_jp=cols[1],
+                req_level=cols[2],
+                cost=cols[3].count(_CIRCLE),
+                description=cols[4],
+                category=category)
+
+
+def _get_column_cells_texts(tr_element):
+    """
+    Return (texts, is_header) for one table row.
+
+    texts has one stripped string per cell. is_header is True when the
+    row contains th cells; otherwise its td cells are read.
+    """
+    is_header = True
+    cells = tr_element.xpath("./th")
+    if not cells:
+        is_header = False
+        cells = tr_element.xpath("./td")
+    texts = []
+    for cell in cells:
+        # NOTE(review): only the cell's direct text nodes are read; text
+        # wrapped in child elements is skipped - confirm against the page.
+        fragments = [t.strip() for t in cell.xpath("./text()")]
+        texts.append(" ".join(f for f in fragments if f))
+    return texts, is_header
+
+
+def _header_to_field_name(s):
+    """Normalise a column header, e.g. "Req. Level" -> "req_level"."""
+    return s.lower().replace(" ", "_").replace(".", "")
+
+
+def _main():
+    """Download the wiki page, parse it and print arts and skills as JSON."""
+    tmp_path = os.path.join(_pathfix.project_path, "tmp")
+    fpath = os.path.join(tmp_path, "wikia-palico-skills.html")
+    parser = etree.HTMLParser()
+    urllib.urlretrieve(_BASE_URL + _PAGE, fpath)
+    with open(fpath) as f:
+        tree = etree.parse(f, parser)
+    arts, skills = extract_arts_and_skills(tree)
+    print json.dumps(arts, indent=2)
+    print json.dumps(skills, indent=2)
+
+
+if __name__ == '__main__':
+    _main()