Skip to content

Commit

Permalink
Correct the deserialization (deser) function for the entity label database
Browse files Browse the repository at this point in the history
  • Loading branch information
Binh Vu committed Sep 22, 2023
1 parent bcc1222 commit c7d4e5d
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 42 deletions.
19 changes: 10 additions & 9 deletions kgdata/wikidata/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
import ray
import serde.jl
import serde.json
from loguru import logger
from timer import Timer

from hugedict.cachedict import CacheDict
from hugedict.prelude import (
RocksDBDict,
Expand All @@ -21,10 +24,6 @@
rocksdb_ingest_sst_files,
rocksdb_load,
)
from loguru import logger
from sm.misc.ray_helper import ray_map
from timer import Timer

from kgdata.config import init_dbdir_from_env
from kgdata.dataset import import_dataset
from kgdata.spark.extended_rdd import DatasetSignature
Expand All @@ -40,6 +39,7 @@
)
from kgdata.wikidata.extra_ent_db import EntAttr, build_extra_ent_db
from kgdata.wikidata.models.wdentitylabel import WDEntityLabel
from sm.misc.ray_helper import ray_map

if TYPE_CHECKING:
from hugedict.hugedict.rocksdb import FileFormat
Expand Down Expand Up @@ -195,7 +195,7 @@ def db_entities_attr(attr: EntAttr, output: str, compact: bool, lang: str):
)
wikidata.add_command(
dataset2db(
"entity_outlinks",
dataset="entity_outlinks",
format={
"record_type": {"type": "ndjson", "key": "id", "value": None},
"is_sorted": False,
Expand All @@ -206,8 +206,8 @@ def db_entities_attr(attr: EntAttr, output: str, compact: bool, lang: str):
wikidata.add_command(dataset2db("properties", "props"))
wikidata.add_command(
dataset2db(
"property_domains",
"prop_domains",
dataset="property_domains",
dbname="prop_domains",
format={
"record_type": {"type": "tuple2", "key": None, "value": None},
"is_sorted": False,
Expand All @@ -216,8 +216,8 @@ def db_entities_attr(attr: EntAttr, output: str, compact: bool, lang: str):
)
wikidata.add_command(
dataset2db(
"property_ranges",
"prop_ranges",
dataset="property_ranges",
dbname="prop_ranges",
format={
"record_type": {"type": "tuple2", "key": None, "value": None},
"is_sorted": False,
Expand Down Expand Up @@ -254,6 +254,7 @@ def db_ontcount(directory: str, output: str, compact: bool, lang: str):
gc.collect()

import ray

from sm.misc.ray_helper import ray_map, ray_put

ray.init()
Expand Down
6 changes: 3 additions & 3 deletions kgdata/wikidata/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
import orjson
import requests
import serde.jl as jl

from hugedict.prelude import CacheDict, RocksDBDict
from hugedict.types import HugeMutableMapping

from kgdata.db import (
deser_from_dict,
get_rocksdb,
Expand Down Expand Up @@ -205,8 +205,8 @@ def __call__(
dbopts=small_dbopts,
)
get_entity_label_db = make_get_rocksdb(
deser_value=partial(deser_from_dict, EntityLabel),
ser_value=ser_to_dict,
deser_value=partial(str, encoding="utf-8"),
ser_value=str.encode,
dbopts=small_dbopts,
)
get_entity_pagerank_db = make_get_rocksdb(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "kgdata"
version = "5.0.0a9"
version = "5.0.0"
description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)"
readme = "README.md"
authors = [{ name = "Binh Vu", email = "binh@toan2.com" }]
Expand Down
60 changes: 31 additions & 29 deletions scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,35 +44,35 @@ function wikidata_db {
# ======================================================================
# WIKIDATA Datasets

# Uncomment the lines below to sign the dump files so that they are not re-processed
# export KGDATA_FORCE_DISABLE_CHECK_SIGNATURE=1
# python -m kgdata.wikidata.datasets -d entity_dump --sign
# python -m kgdata.wikidata.datasets -d entity_redirection_dump --sign
# python -m kgdata.wikidata.datasets -d page_dump --sign

wikidata_dataset page_ids
wikidata_dataset entity_ids
wikidata_dataset entity_redirections
wikidata_dataset entities
wikidata_dataset entity_types

wikidata_dataset classes
wikidata_dataset properties

wikidata_dataset class_count
wikidata_dataset property_count
wikidata_dataset property_domains
wikidata_dataset property_ranges

wikidata_dataset cross_wiki_mapping

wikidata_dataset entity_metadata
wikidata_dataset entity_all_types
wikidata_dataset entity_degrees
wikidata_dataset entity_labels
wikidata_dataset entity_types_and_degrees
wikidata_dataset entity_outlinks
wikidata_dataset entity_pagerank
# # NOTE: uncomment the lines below to sign the dump files so that they are not re-processed
# # export KGDATA_FORCE_DISABLE_CHECK_SIGNATURE=1
# # python -m kgdata.wikidata.datasets -d entity_dump --sign
# # python -m kgdata.wikidata.datasets -d entity_redirection_dump --sign
# # python -m kgdata.wikidata.datasets -d page_dump --sign

# wikidata_dataset page_ids
# wikidata_dataset entity_ids
# wikidata_dataset entity_redirections
# wikidata_dataset entities
# wikidata_dataset entity_types

# wikidata_dataset classes
# wikidata_dataset properties

# wikidata_dataset class_count
# wikidata_dataset property_count
# wikidata_dataset property_domains
# wikidata_dataset property_ranges

# wikidata_dataset cross_wiki_mapping

# wikidata_dataset entity_metadata
# wikidata_dataset entity_all_types
# wikidata_dataset entity_degrees
# wikidata_dataset entity_labels
# wikidata_dataset entity_types_and_degrees
# wikidata_dataset entity_outlinks
# wikidata_dataset entity_pagerank

# ======================================================================
# WIKIPEDIA Datasets
Expand Down Expand Up @@ -100,3 +100,5 @@ wikidata_db entity_outlinks
wikidata_db entity_redirections
wikidata_db wp2wd
wikidata_db entity_pagerank
wikidata_db property_domains
wikidata_db property_ranges

0 comments on commit c7d4e5d

Please sign in to comment.