Skip to content

Commit

Permalink
add notebook to migrate manual modification
Browse files Browse the repository at this point in the history
  • Loading branch information
Binh Vu committed Sep 9, 2023
1 parent a6e000d commit ba454e0
Showing 1 changed file with 370 additions and 0 deletions.
370 changes: 370 additions & 0 deletions scripts/modifications.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "raw",
"id": "80c08db5-6cc2-4fc0-bae6-e44c40495c9e",
"metadata": {
"tags": []
},
"source": [
"!pip install deepdiff"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e531a128-ebdc-4727-8038-c560c0b61f1d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from os.path import expanduser\n",
"\n",
"# Home-relative kgdata folders, with `~` expanded once here.\n",
"WIKIDATA_DIR, WIKIPEDIA_DIR = map(\n",
"    expanduser, (\"~/kgdata/wikidata\", \"~/kgdata/wikipedia\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f8f6798b-ecaa-4039-b372-d2429c8d9774",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import deepdiff, serde.jl, pandas as pd\n",
"from loguru import logger\n",
"from kgdata.wikidata.db import WikidataDB, get_class_db, get_prop_db, get_entity_db, WDClass, WDProperty\n",
"from dataclasses import dataclass\n",
"from typing import *\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f8106cb5-5e20-48fd-92fe-6340f26f5911",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"@dataclass\n",
"class Intervention:\n",
"    \"\"\"One manual edit to a list stored inside a record.\n",
"\n",
"    `attrpath` is the sequence of keys leading from the record to the list and\n",
"    `value` is the item that is removed from, or appended to, that list.\n",
"    \"\"\"\n",
"\n",
"    # id of the record the edit belongs to; `from_tsv` groups by this field\n",
"    objid: str\n",
"    # only list removal and list append are implemented by `apply`\n",
"    action: Literal[\"list:remove\", \"list:add\"]\n",
"    attrpath: list[str]\n",
"    value: str | int\n",
"\n",
"    def apply(self, obj: dict):\n",
"        \"\"\"Apply this edit to `obj` in place.\n",
"\n",
"        Raises:\n",
"            NotImplementedError: if `action` is not one of the supported actions.\n",
"            ValueError: from `list.remove`, if a `list:remove` value is absent.\n",
"        \"\"\"\n",
"        if self.action == 'list:remove':\n",
"            self.get_item(obj, self.attrpath).remove(self.value)\n",
"        elif self.action == 'list:add':\n",
"            self.get_item(obj, self.attrpath).append(self.value)\n",
"        else:\n",
"            raise NotImplementedError(f\"unsupported action: {self.action!r}\")\n",
"\n",
"    def get_item(self, obj: dict, attrpath: list[str]):\n",
"        \"\"\"Follow `attrpath` key by key starting at `obj` and return what is reached.\"\"\"\n",
"        for attr in attrpath:\n",
"            obj = obj[attr]\n",
"        return obj\n",
"\n",
"    def to_dict(self):\n",
"        \"\"\"Return a flat dict of the fields, with `attrpath` joined by dots.\"\"\"\n",
"        return {'objid': self.objid, 'action': self.action, 'attrpath': \".\".join(self.attrpath), 'value': self.value}\n",
"\n",
"    @staticmethod\n",
"    def from_dict(obj: dict):\n",
"        \"\"\"Build an Intervention from a dict shaped like the output of `to_dict`.\n",
"\n",
"        The input dict is not modified. `attrpath` may be a dotted string or an\n",
"        already split sequence of keys.\n",
"        \"\"\"\n",
"        attrpath = obj['attrpath']\n",
"        attrpath = attrpath.split(\".\") if isinstance(attrpath, str) else list(attrpath)\n",
"        # Build from a copy so the caller's record keeps its original 'attrpath'.\n",
"        return Intervention(**{**obj, 'attrpath': attrpath})\n",
"\n",
"    @staticmethod\n",
"    def from_tsv(file: Path | str) -> dict[str, list['Intervention']]:\n",
"        \"\"\"Read interventions from a tab-separated file and group them by `objid`.\n",
"\n",
"        The columns must match the dataclass fields (objid, action, attrpath,\n",
"        value); anything after a '#' is treated as a comment.\n",
"        \"\"\"\n",
"        records = pd.read_csv(file, comment='#', delimiter='\\t').to_dict('records')\n",
"        idmap: dict[str, list['Intervention']] = {}\n",
"        for record in records:\n",
"            item = Intervention.from_dict(record)\n",
"            idmap.setdefault(item.objid, []).append(item)\n",
"        return idmap"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "68216a31-a582-438d-886a-cad9167bf084",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Handle on the Wikidata database folder (the 20211213 directory).\n",
"db = WikidataDB(\n",
"    expanduser(\"~/kgdata/databases/wikidata/20211213\")\n",
")"
]
},
{
"cell_type": "markdown",
"id": "1eea9e4d-e43e-422c-940c-b4e59cae47e2",
"metadata": {},
"source": [
"## Modify classes"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "34370bdd-5f6b-4615-a084-4d84e5aeee87",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"olddb = get_class_db(db.database_dir / \"classes.db\", read_only=True)\n",
"\n",
"# Records read from classes.fixed.jl.old, keyed by their 'id'.\n",
"id2obj = {\n",
"    record['id']: record\n",
"    for record in serde.jl.deser(db.database_dir / \"classes.fixed.jl.old\")\n",
"}\n",
"\n",
"\n",
"def get_label(id):\n",
"    \"\"\"Return `<label> (<id>)` for display; ids missing from `olddb` get a placeholder label.\"\"\"\n",
"    if id not in olddb:\n",
"        # olddb[id] would raise for a missing id, e.g. while reporting that the id is not in the db.\n",
"        return f\"<not in db> ({id})\"\n",
"    return f\"{olddb[id].label} ({id})\""
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "ba9308c7-15d7-48c2-bf5c-66026ab633a2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Reuse the configured (already `~`-expanded) WIKIDATA_DIR instead of repeating the path.\n",
"id2interven = Intervention.from_tsv(Path(WIKIDATA_DIR) / \"intervention\" / \"classes.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "dafe3b9c-4839-496d-84c0-d7e789fe8bc0",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
">>> Process military unit (Q176799)\n",
"diff {'iterable_item_removed': {\"root['parents'][2]\": 'Q781132'}}\n",
"['military organization (Q15627509)', 'armed organization (Q17149090)', 'military branch (Q781132)', 'organizational subdivision (Q9261468)']\n",
"['military organization (Q15627509)', 'armed organization (Q17149090)', 'organizational subdivision (Q9261468)']\n",
">>> Process house (Q3947)\n",
"diff {'values_changed': {\"root['parents'][0]\": {'new_value': 'Q41176', 'old_value': 'Q11755880'}}}\n",
"['residential building (Q11755880)']\n",
"['building (Q41176)']\n",
">>> Process polysaccharides (Q134219)\n",
"diff {'iterable_item_removed': {\"root['parents'][2]\": 'Q2553138'}}\n",
"['carbohydrate (Q11358)', 'macromolecule (Q178593)', 'glycan (Q2553138)']\n",
"['carbohydrate (Q11358)', 'macromolecule (Q178593)']\n",
">>> Process financial services (Q837171)\n",
"diff {'iterable_item_removed': {\"root['parents'][3]\": 'Q806750'}}\n",
"['product (Q2424752)', 'business service (Q25351891)', 'service (Q7406919)', 'banking services (Q806750)']\n",
"['product (Q2424752)', 'business service (Q25351891)', 'service (Q7406919)']\n",
">>> Process financial transaction (Q1166072)\n",
"diff {'iterable_item_removed': {\"root['parents'][0]\": 'Q1148747'}}\n",
"['payment (Q1148747)', 'trade (Q601401)']\n",
"['trade (Q601401)']\n",
">>> Process payment (Q1148747)\n",
"diff {}\n",
"['financial transaction (Q1166072)']\n",
"['financial transaction (Q1166072)']\n",
">>> Process axiom (Q17736)\n",
"diff {'iterable_item_removed': {\"root['parents'][2]\": 'Q536351'}}\n",
"['proposition (Q108163)', 'statement (Q2684591)', 'first principle (Q536351)']\n",
"['proposition (Q108163)', 'statement (Q2684591)']\n",
">>> Process principle (Q211364)\n",
"diff {}\n",
"['rule (Q1151067)', 'axiom (Q17736)', 'structure (Q6671777)']\n",
"['rule (Q1151067)', 'axiom (Q17736)', 'structure (Q6671777)']\n",
">>> Process first principle (Q536351)\n",
"diff {}\n",
"['class (Q16889133)', 'principle (Q211364)']\n",
"['class (Q16889133)', 'principle (Q211364)']\n",
">>> Process residential building (Q11755880)\n",
"diff {}\n",
"['house (Q3947)', 'residence (Q699405)']\n",
"['house (Q3947)', 'residence (Q699405)']\n",
">>> Process military branch (Q781132)\n",
"diff {}\n",
"['military unit (Q176799)']\n",
"['military unit (Q176799)']\n"
]
}
],
"source": [
"for id, obj in id2obj.items():\n",
"    if id not in olddb:\n",
"        # Log the bare id: get_label() looks up olddb[id], which is exactly what is missing here.\n",
"        logger.error(\"{} is not in db\", id)\n",
"        continue\n",
"\n",
"    print(\">>> Process\", get_label(id))\n",
"    oldobj = olddb[id].to_dict()\n",
"    # Leave 'ancestors' out of the comparison on both sides.\n",
"    obj.pop('ancestors', None)\n",
"    oldobj.pop('ancestors')\n",
"\n",
"    diff = deepdiff.diff.DeepDiff(oldobj, obj)\n",
"    print('diff', diff)\n",
"    print([get_label(x) for x in oldobj['parents']])\n",
"    print([get_label(x) for x in obj['parents']])\n",
"\n",
"    # Replay the recorded interventions on the database record; afterwards it must\n",
"    # equal the fixed record. Checked for every object, so a difference that has\n",
"    # no intervention recorded is reported as well.\n",
"    for mod in id2interven.get(id, []):\n",
"        mod.apply(oldobj)\n",
"    diff = deepdiff.diff.DeepDiff(oldobj, obj)\n",
"    assert diff == {}, f\"{id}: interventions do not reproduce the fixed record: {diff}\""
]
},
{
"cell_type": "markdown",
"id": "5ed26b69-0607-44ef-b105-ad498ba89dbd",
"metadata": {},
"source": [
"## Modify properties"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "50233f5d-e34d-4c1f-b912-b2f1f538da5a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"olddb = get_prop_db(db.database_dir / \"props.db\", read_only=True)\n",
"\n",
"# Records read from props.fixed.jl.old, keyed by their 'id'.\n",
"id2obj = {\n",
"    record['id']: record\n",
"    for record in serde.jl.deser(db.database_dir / \"props.fixed.jl.old\")\n",
"}\n",
"\n",
"\n",
"def get_label(id):\n",
"    \"\"\"Return `<label> (<id>)` for display; ids missing from `olddb` get a placeholder label.\"\"\"\n",
"    if id not in olddb:\n",
"        # olddb[id] would raise for a missing id, e.g. while reporting that the id is not in the db.\n",
"        return f\"<not in db> ({id})\"\n",
"    return f\"{olddb[id].label} ({id})\""
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "75426ed4-49d9-423e-b3ac-505c66095bc7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Reuse the configured (already `~`-expanded) WIKIDATA_DIR instead of repeating the path.\n",
"id2interven = Intervention.from_tsv(Path(WIKIDATA_DIR) / \"intervention\" / \"props.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "3835a7e8-3e17-46ae-80b4-3093ab61af7e",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
">>> Process locator map image (P242)\n",
"diff {'iterable_item_removed': {\"root['parents'][1]\": 'P927'}}\n",
"['image (P18)', 'anatomical location (P927)']\n",
"['image (P18)']\n",
">>> Process detail map (P1621)\n",
"diff {}\n",
"['locator map image (P242)']\n",
"['locator map image (P242)']\n",
">>> Process route map (P15)\n",
"diff {}\n",
"['locator map image (P242)']\n",
"['locator map image (P242)']\n",
">>> Process plan view image (P3311)\n",
"diff {}\n",
"['detail map (P1621)']\n",
"['detail map (P1621)']\n",
">>> Process location (P276)\n",
"diff {'iterable_item_removed': {\"root['parents'][0]\": 'P7153'}}\n",
"['significant place (P7153)']\n",
"[]\n",
">>> Process located in the administrative territorial entity (P131)\n",
"diff {}\n",
"['location (P276)', 'part of (P361)']\n",
"['location (P276)', 'part of (P361)']\n",
">>> Process significant place (P7153)\n",
"diff {}\n",
"['location (P276)']\n",
"['location (P276)']\n",
">>> Process country (P17)\n",
"diff {}\n",
"['located in the administrative territorial entity (P131)']\n",
"['located in the administrative territorial entity (P131)']\n",
">>> Process country of citizenship (P27)\n",
"diff {}\n",
"['country (P17)']\n",
"['country (P17)']\n"
]
}
],
"source": [
"for id, obj in id2obj.items():\n",
"    if id not in olddb:\n",
"        # Log the bare id: get_label() looks up olddb[id], which is exactly what is missing here.\n",
"        logger.error(\"{} is not in db\", id)\n",
"        continue\n",
"\n",
"    print(\">>> Process\", get_label(id))\n",
"    oldobj = olddb[id].to_dict()\n",
"    # Leave 'ancestors' out of the comparison on both sides.\n",
"    obj.pop('ancestors', None)\n",
"    oldobj.pop('ancestors')\n",
"\n",
"    diff = deepdiff.diff.DeepDiff(oldobj, obj)\n",
"    print('diff', diff)\n",
"    print([get_label(x) for x in oldobj['parents']])\n",
"    print([get_label(x) for x in obj['parents']])\n",
"\n",
"    # Replay the recorded interventions on the database record; afterwards it must\n",
"    # equal the fixed record. Checked for every object, so a difference that has\n",
"    # no intervention recorded is reported as well.\n",
"    for mod in id2interven.get(id, []):\n",
"        mod.apply(oldobj)\n",
"    diff = deepdiff.diff.DeepDiff(oldobj, obj)\n",
"    assert diff == {}, f\"{id}: interventions do not reproduce the fixed record: {diff}\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "gramsplus",
"language": "python",
"name": "gramsplus"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit ba454e0

Please sign in to comment.