-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add notebook to migrate manual modification
- Loading branch information
Binh Vu
committed
Sep 9, 2023
1 parent
a6e000d
commit ba454e0
Showing
1 changed file
with
370 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,370 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "raw", | ||
"id": "80c08db5-6cc2-4fc0-bae6-e44c40495c9e", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"source": [ | ||
"!pip install deepdiff" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "e531a128-ebdc-4727-8038-c560c0b61f1d", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from os.path import expanduser\n", | ||
"\n", | ||
"WIKIDATA_DIR = expanduser(\"~/kgdata/wikidata\")\n", | ||
"WIKIPEDIA_DIR = expanduser(\"~/kgdata/wikipedia\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "f8f6798b-ecaa-4039-b372-d2429c8d9774", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import deepdiff, serde.jl, pandas as pd\n", | ||
"from loguru import logger\n", | ||
"from kgdata.wikidata.db import WikidataDB, get_class_db, get_prop_db, get_entity_db, WDClass, WDProperty\n", | ||
"from dataclasses import dataclass\n", | ||
"from typing import *\n", | ||
"from pathlib import Path" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "f8106cb5-5e20-48fd-92fe-6340f26f5911", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"@dataclass\n", | ||
"class Intervention:\n", | ||
" objid: str\n", | ||
" action: Literal[\"list:remove\", \"list:add\"]\n", | ||
" attrpath: list[str]\n", | ||
" value: str | int\n", | ||
" \n", | ||
" def apply(self, obj: dict):\n", | ||
" if self.action == 'list:remove':\n", | ||
" lst = self.get_item(obj, self.attrpath)\n", | ||
" lst.remove(self.value)\n", | ||
" elif self.action == 'list:add':\n", | ||
" lst = self.get_item(obj, self.attrpath)\n", | ||
" lst.append(self.value)\n", | ||
" else:\n", | ||
" raise NotImplementedError()\n", | ||
" \n", | ||
" def get_item(self, obj: dict, attrpath: list[str]):\n", | ||
" for attr in attrpath:\n", | ||
" obj = obj[attr]\n", | ||
" return obj\n", | ||
"\n", | ||
" def to_dict(self):\n", | ||
" return {'objid': self.objid, 'action': self.action, 'attrpath': \".\".join(self.attrpath), 'value': self.value}\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def from_dict(obj: dict):\n", | ||
" obj['attrpath'] = obj['attrpath'].split(\".\")\n", | ||
" return Intervention(**obj)\n", | ||
" \n", | ||
" @staticmethod\n", | ||
" def from_tsv(file: Path | str) -> dict[str, list['Intervention']]:\n", | ||
" records = pd.read_csv(file, comment='#', delimiter='\\t').to_dict('records')\n", | ||
" lst = list(map(Intervention.from_dict, records))\n", | ||
" idmap = {}\n", | ||
" for item in lst:\n", | ||
" if item.objid not in idmap:\n", | ||
" idmap[item.objid] = []\n", | ||
" idmap[item.objid].append(item)\n", | ||
" return idmap" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 81, | ||
"id": "68216a31-a582-438d-886a-cad9167bf084", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"db = WikidataDB(expanduser(\"~/kgdata/databases/wikidata/20211213\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1eea9e4d-e43e-422c-940c-b4e59cae47e2", | ||
"metadata": {}, | ||
"source": [ | ||
"### Modify classes" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 95, | ||
"id": "34370bdd-5f6b-4615-a084-4d84e5aeee87", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"olddb = get_class_db(db.database_dir / \"classes.db\", read_only=True)\n", | ||
"\n", | ||
"id2obj = {}\n", | ||
"for obj in serde.jl.deser(db.database_dir / \"classes.fixed.jl.old\"):\n", | ||
" id2obj[obj['id']] = obj\n", | ||
"\n", | ||
"def get_label(id):\n", | ||
" return f\"{olddb[id].label} ({id})\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 96, | ||
"id": "ba9308c7-15d7-48c2-bf5c-66026ab633a2", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"id2interven = Intervention.from_tsv(\"~/kgdata/wikidata/intervention/classes.tsv\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 97, | ||
"id": "dafe3b9c-4839-496d-84c0-d7e789fe8bc0", | ||
"metadata": { | ||
"collapsed": true, | ||
"jupyter": { | ||
"outputs_hidden": true | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
">>> Process military unit (Q176799)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][2]\": 'Q781132'}}\n", | ||
"['military organization (Q15627509)', 'armed organization (Q17149090)', 'military branch (Q781132)', 'organizational subdivision (Q9261468)']\n", | ||
"['military organization (Q15627509)', 'armed organization (Q17149090)', 'organizational subdivision (Q9261468)']\n", | ||
">>> Process house (Q3947)\n", | ||
"diff {'values_changed': {\"root['parents'][0]\": {'new_value': 'Q41176', 'old_value': 'Q11755880'}}}\n", | ||
"['residential building (Q11755880)']\n", | ||
"['building (Q41176)']\n", | ||
">>> Process polysaccharides (Q134219)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][2]\": 'Q2553138'}}\n", | ||
"['carbohydrate (Q11358)', 'macromolecule (Q178593)', 'glycan (Q2553138)']\n", | ||
"['carbohydrate (Q11358)', 'macromolecule (Q178593)']\n", | ||
">>> Process financial services (Q837171)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][3]\": 'Q806750'}}\n", | ||
"['product (Q2424752)', 'business service (Q25351891)', 'service (Q7406919)', 'banking services (Q806750)']\n", | ||
"['product (Q2424752)', 'business service (Q25351891)', 'service (Q7406919)']\n", | ||
">>> Process financial transaction (Q1166072)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][0]\": 'Q1148747'}}\n", | ||
"['payment (Q1148747)', 'trade (Q601401)']\n", | ||
"['trade (Q601401)']\n", | ||
">>> Process payment (Q1148747)\n", | ||
"diff {}\n", | ||
"['financial transaction (Q1166072)']\n", | ||
"['financial transaction (Q1166072)']\n", | ||
">>> Process axiom (Q17736)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][2]\": 'Q536351'}}\n", | ||
"['proposition (Q108163)', 'statement (Q2684591)', 'first principle (Q536351)']\n", | ||
"['proposition (Q108163)', 'statement (Q2684591)']\n", | ||
">>> Process principle (Q211364)\n", | ||
"diff {}\n", | ||
"['rule (Q1151067)', 'axiom (Q17736)', 'structure (Q6671777)']\n", | ||
"['rule (Q1151067)', 'axiom (Q17736)', 'structure (Q6671777)']\n", | ||
">>> Process first principle (Q536351)\n", | ||
"diff {}\n", | ||
"['class (Q16889133)', 'principle (Q211364)']\n", | ||
"['class (Q16889133)', 'principle (Q211364)']\n", | ||
">>> Process residential building (Q11755880)\n", | ||
"diff {}\n", | ||
"['house (Q3947)', 'residence (Q699405)']\n", | ||
"['house (Q3947)', 'residence (Q699405)']\n", | ||
">>> Process military branch (Q781132)\n", | ||
"diff {}\n", | ||
"['military unit (Q176799)']\n", | ||
"['military unit (Q176799)']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for id, obj in id2obj.items():\n", | ||
" if id not in olddb:\n", | ||
" logger.error(\"{} is not in db\", get_label(id))\n", | ||
" continue\n", | ||
" \n", | ||
" print(\">>> Process\", get_label(id))\n", | ||
" oldobj = olddb[id].to_dict()\n", | ||
" obj.pop('ancestors', None)\n", | ||
" oldobj.pop('ancestors')\n", | ||
" \n", | ||
" diff = deepdiff.diff.DeepDiff(oldobj, obj)\n", | ||
" print('diff', diff)\n", | ||
" print([get_label(x) for x in oldobj['parents']])\n", | ||
" print([get_label(x) for x in obj['parents']])\n", | ||
" \n", | ||
" if id in id2interven:\n", | ||
" [mod.apply(oldobj) for mod in id2interven[id]]\n", | ||
" diff = deepdiff.diff.DeepDiff(oldobj, obj)\n", | ||
" assert diff == {}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5ed26b69-0607-44ef-b105-ad498ba89dbd", | ||
"metadata": {}, | ||
"source": [ | ||
"## Modify properties" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 100, | ||
"id": "50233f5d-e34d-4c1f-b912-b2f1f538da5a", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"olddb = get_prop_db(db.database_dir / \"props.db\", read_only=True)\n", | ||
"\n", | ||
"id2obj = {}\n", | ||
"for obj in serde.jl.deser(db.database_dir / \"props.fixed.jl.old\"):\n", | ||
" id2obj[obj['id']] = obj\n", | ||
"\n", | ||
"def get_label(id):\n", | ||
" return f\"{olddb[id].label} ({id})\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 101, | ||
"id": "75426ed4-49d9-423e-b3ac-505c66095bc7", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"id2interven = Intervention.from_tsv(\"~/kgdata/wikidata/intervention/props.tsv\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 103, | ||
"id": "3835a7e8-3e17-46ae-80b4-3093ab61af7e", | ||
"metadata": { | ||
"collapsed": true, | ||
"jupyter": { | ||
"outputs_hidden": true | ||
}, | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
">>> Process locator map image (P242)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][1]\": 'P927'}}\n", | ||
"['image (P18)', 'anatomical location (P927)']\n", | ||
"['image (P18)']\n", | ||
">>> Process detail map (P1621)\n", | ||
"diff {}\n", | ||
"['locator map image (P242)']\n", | ||
"['locator map image (P242)']\n", | ||
">>> Process route map (P15)\n", | ||
"diff {}\n", | ||
"['locator map image (P242)']\n", | ||
"['locator map image (P242)']\n", | ||
">>> Process plan view image (P3311)\n", | ||
"diff {}\n", | ||
"['detail map (P1621)']\n", | ||
"['detail map (P1621)']\n", | ||
">>> Process location (P276)\n", | ||
"diff {'iterable_item_removed': {\"root['parents'][0]\": 'P7153'}}\n", | ||
"['significant place (P7153)']\n", | ||
"[]\n", | ||
">>> Process located in the administrative territorial entity (P131)\n", | ||
"diff {}\n", | ||
"['location (P276)', 'part of (P361)']\n", | ||
"['location (P276)', 'part of (P361)']\n", | ||
">>> Process significant place (P7153)\n", | ||
"diff {}\n", | ||
"['location (P276)']\n", | ||
"['location (P276)']\n", | ||
">>> Process country (P17)\n", | ||
"diff {}\n", | ||
"['located in the administrative territorial entity (P131)']\n", | ||
"['located in the administrative territorial entity (P131)']\n", | ||
">>> Process country of citizenship (P27)\n", | ||
"diff {}\n", | ||
"['country (P17)']\n", | ||
"['country (P17)']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for id, obj in id2obj.items():\n", | ||
" if id not in olddb:\n", | ||
" logger.error(\"{} is not in db\", get_label(id))\n", | ||
" continue\n", | ||
" \n", | ||
" print(\">>> Process\", get_label(id))\n", | ||
" oldobj = olddb[id].to_dict()\n", | ||
" obj.pop('ancestors', None)\n", | ||
" oldobj.pop('ancestors')\n", | ||
" \n", | ||
" diff = deepdiff.diff.DeepDiff(oldobj, obj)\n", | ||
" print('diff', diff)\n", | ||
" print([get_label(x) for x in oldobj['parents']])\n", | ||
" print([get_label(x) for x in obj['parents']])\n", | ||
" \n", | ||
" if id in id2interven:\n", | ||
" [mod.apply(oldobj) for mod in id2interven[id]]\n", | ||
" diff = deepdiff.diff.DeepDiff(oldobj, obj)\n", | ||
" assert diff == {}" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "gramsplus", | ||
"language": "python", | ||
"name": "gramsplus" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |