Skip to content

Commit

Permalink
Merge branch 'master' of github.com:binh-vu/kgdata
Browse files Browse the repository at this point in the history
  • Loading branch information
Binh Vu committed Jun 7, 2024
2 parents 9215c2b + 35c92cf commit ab3f2e0
Show file tree
Hide file tree
Showing 10 changed files with 73 additions and 29 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# CHANGE LOG

## [7.0.4] (2024-05-11)

### Fixed

- Fix code to not require `spark` or `ray` unless it is really needed

## [7.0.2] (2024-05-10)

### Fixed

- Remove unused imports

## [7.0.1] (2024-04-25)

### Added
Expand Down
7 changes: 4 additions & 3 deletions kgdata/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@
import serde.json
import serde.textline
from hugedict.misc import Chain2, identity
from loguru import logger
from tqdm.auto import tqdm

from kgdata.config import init_dbdir_from_env
from kgdata.misc.optional_import import RDD
from kgdata.misc.query import PropQuery, every
from kgdata.spark import ExtendedRDD, SparkLikeInterface, get_spark_context
from kgdata.spark.common import does_result_dir_exist, text_file
from kgdata.spark.extended_rdd import DatasetSignature
from loguru import logger
from pyspark import RDD
from tqdm.auto import tqdm

V = TypeVar("V")
V2 = TypeVar("V2")
Expand Down
8 changes: 4 additions & 4 deletions kgdata/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,12 @@
rocksdb_load,
)
from hugedict.types import HugeMutableMapping
from loguru import logger
from sm.namespaces.namespace import KnowledgeGraphNamespace

from kgdata.models.entity import Entity, EntityMetadata
from kgdata.models.multilingual import MultiLingualString, MultiLingualStringList
from kgdata.models.ont_class import OntologyClass
from kgdata.models.ont_property import OntologyProperty, get_default_props
from loguru import logger
from rdflib import RDF, RDFS, XSD
from sm.namespaces.namespace import KnowledgeGraphNamespace

if TYPE_CHECKING:
from hugedict.core.rocksdb import FileFormat
Expand Down Expand Up @@ -378,3 +377,4 @@ def cli(data_dir: str, dbname: str, keys: list[str]):
break

cli()
cli()
26 changes: 26 additions & 0 deletions kgdata/misc/optional_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Optional pyspark imports.

Re-exports the pyspark names used across kgdata. When pyspark is not
installed, the names are bound to ``None`` and ``has_spark`` is ``False``,
so importing this module (and modules that depend on it) never fails;
only actually *using* spark functionality does.
"""

from functools import wraps

try:
    from pyspark import Broadcast, SparkConf, SparkContext, TaskContext
    from pyspark.rdd import RDD, portable_hash

    has_spark = True
except ImportError:
    # Placeholders so `from kgdata.misc.optional_import import RDD` (etc.)
    # keeps working without pyspark; callers must guard usage via
    # `has_spark` or the `require_spark` decorator below.
    RDD = None
    SparkContext = None
    TaskContext = None
    SparkConf = None
    portable_hash = None
    Broadcast = None

    # Must live inside the except branch: at module level it would
    # unconditionally clobber the True set on a successful import.
    has_spark = False


def require_spark(func):
    """Decorator that defers the pyspark requirement to call time.

    Decorating is always safe even without pyspark installed; calling the
    wrapped function raises ImportError when pyspark is unavailable,
    otherwise the call is forwarded unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        if not has_spark:
            raise ImportError("pyspark is required for function: %s" % func.__name__)
        return func(*args, **kwargs)

    return wrapper
18 changes: 12 additions & 6 deletions kgdata/spark/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,18 @@

import orjson
import zstandard as zstd
from kgdata.misc.funcs import deser_zstd_records
from loguru import logger
from pyspark import RDD, SparkConf, SparkContext, TaskContext
from sm.misc.funcs import assert_not_null

from kgdata.misc.funcs import deser_zstd_records
from kgdata.misc.optional_import import (
RDD,
SparkConf,
SparkContext,
TaskContext,
require_spark,
)

# SparkContext singleton
_sc = None
StrPath = Union[Path, str]
Expand All @@ -46,6 +53,7 @@
V2 = TypeVar("V2")


@require_spark
def get_spark_context() -> SparkContext:
"""Get spark context
Expand Down Expand Up @@ -459,7 +467,7 @@ def fix_rdd():
# template to fix an rdd
# ###############################
# TODO: set input file
indir = "/nas/home/binhvu/workspace/sm-dev/data/wikidata/step_2/schema"
indir = os.path.expanduser("~/workspace/sm-dev/data/wikidata/step_2/schema")
infile = indir + "/class_schema"

# ###############################
Expand Down Expand Up @@ -634,6 +642,4 @@ def convert(x):

@dataclass
class EmptyBroadcast(Generic[V]):
value: V
value: V
value: V
value: V
6 changes: 3 additions & 3 deletions kgdata/spark/extended_rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
)

import serde.json
from kgdata.misc.funcs import deser_zstd_records
from typing_extensions import TypeGuard

from kgdata.misc.optional_import import RDD, portable_hash
from kgdata.spark.common import (
StrPath,
are_records_unique,
Expand All @@ -34,8 +36,6 @@
save_as_text_file,
text_file,
)
from pyspark.rdd import RDD, portable_hash
from typing_extensions import TypeGuard

if TYPE_CHECKING:
from kgdata.dataset import Dataset
Expand Down
10 changes: 4 additions & 6 deletions kgdata/wikidata/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@
rocksdb_build_sst_file,
rocksdb_ingest_sst_files,
)
from sm.misc.ray_helper import ray_map, ray_put
from timer import Timer

from kgdata.config import init_dbdir_from_env
from kgdata.db import build_database
from kgdata.wikidata.config import WikidataDirCfg
from kgdata.wikidata.datasets.class_count import class_count
from kgdata.wikidata.datasets.property_count import property_count
from kgdata.wikidata.db import WikidataDB, get_ontcount_db, pack_int
from kgdata.wikidata.extra_ent_db import EntAttr, build_extra_ent_db
from timer import Timer

if TYPE_CHECKING:
from hugedict.core.rocksdb import FileFormat
Expand Down Expand Up @@ -248,11 +250,7 @@ def db_ontcount(directory: str, output: str, compact: bool, lang: str):
).options
gc.collect()

import ray
from sm.misc.ray_helper import ray_map, ray_put

ray.init()


def _build_sst_file(
infile: str, temp_dir: str, posfix: str, options: RocksDBOptions
):
Expand Down
2 changes: 1 addition & 1 deletion kgdata/wikidata/datasets/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

import orjson
from loguru import logger
from pyspark import Broadcast
from sm.misc.funcs import filter_duplication, is_not_null

from kgdata.dataset import Dataset
from kgdata.misc.funcs import split_tab_2
from kgdata.misc.optional_import import Broadcast
from kgdata.spark import get_spark_context
from kgdata.wikidata.config import WikidataDirCfg
from kgdata.wikidata.datasets.entity_dump import entity_dump
Expand Down
5 changes: 3 additions & 2 deletions kgdata/wikidata/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import serde.jl as jl
from hugedict.prelude import CacheDict, RocksDBDict
from hugedict.types import HugeMutableMapping
from sm.misc.funcs import import_func

from kgdata.db import (
GenericDB,
deser_from_dict,
Expand All @@ -40,14 +42,12 @@
unpack_int,
)
from kgdata.models.entity import EntityOutLinks
from kgdata.wikidata.datasets.mention_to_entities import MentionToEntities
from kgdata.wikidata.extra_ent_db import EntAttr, get_entity_attr_db
from kgdata.wikidata.models import WDClass, WDProperty
from kgdata.wikidata.models.wdentity import WDEntity
from kgdata.wikidata.models.wdentitylabel import WDEntityLabel
from kgdata.wikidata.models.wdentitylink import WDEntityWikiLink
from kgdata.wikidata.models.wdentitymetadata import WDEntityMetadata
from sm.misc.funcs import import_func

V = TypeVar("V", WDEntity, WDClass, WDProperty, WDEntityLabel, WDEntityWikiLink)

Expand Down Expand Up @@ -448,3 +448,4 @@ def cli(data_dir: str, dbname: str, format: str, keys: list[str]):
break

cli()
cli()
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "kgdata"
version = "7.0.1"
version = "7.0.4"
description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)"
readme = "README.md"
authors = [{ name = "Binh Vu", email = "binh@toan2.com" }]
Expand All @@ -25,13 +25,12 @@ dependencies = [
'redis >= 3.5.3, < 4.0.0',
'numpy >= 1.22.3, < 2.0.0',
'requests >= 2.28.0, < 3.0.0',
'sem-desc >= 6.7.2, < 7.0.0',
'sem-desc >= 6.11.2, < 7.0.0',
'click >= 8.1.3, < 9.0.0',
'parsimonious >= 0.8.1, < 0.9.0',
'hugedict >= 2.12.10, < 3.0.0',
'rsoup >= 3.1.7, < 4.0.0',
'lxml >= 4.9.0, < 5.0.0',
'ray >= 2.0.1, < 3.0.0',
'pqdict >= 1.3.0, < 2.0.0',
'ftfy >= 6.1.3, < 7.0.0',
]
Expand All @@ -47,7 +46,8 @@ dev = [
'black >= 22.10.0, < 23.0.0',
]
spark = ['pyspark >= 3.5.0, < 4.0.0']
all = ["kgdata[dev,spark]"]
ray = ['ray >= 2.0.1, < 3.0.0']
all = ["kgdata[dev,spark,ray]"]

[tool.maturin]
module-name = "kgdata.core"
Expand Down

0 comments on commit ab3f2e0

Please sign in to comment.