From 912d176970d3f67d52299c40d0862b5d606385de Mon Sep 17 00:00:00 2001 From: Binh Vu Date: Fri, 10 May 2024 22:30:48 -0700 Subject: [PATCH 1/2] Remove unused imports --- CHANGELOG.md | 6 ++++++ kgdata/db.py | 8 ++++---- kgdata/wikidata/db.py | 5 +++-- pyproject.toml | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7f1b29..4b1f0dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # CHANGE LOG +## [7.0.2] (2024-05-10) + +### Fixed + +- Remove unused imports + ## [7.0.1] (2024-04-25) ### Added diff --git a/kgdata/db.py b/kgdata/db.py index e4db204..8002a4d 100644 --- a/kgdata/db.py +++ b/kgdata/db.py @@ -30,13 +30,12 @@ rocksdb_load, ) from hugedict.types import HugeMutableMapping +from loguru import logger +from sm.namespaces.namespace import KnowledgeGraphNamespace + from kgdata.models.entity import Entity, EntityMetadata -from kgdata.models.multilingual import MultiLingualString, MultiLingualStringList from kgdata.models.ont_class import OntologyClass from kgdata.models.ont_property import OntologyProperty, get_default_props -from loguru import logger -from rdflib import RDF, RDFS, XSD -from sm.namespaces.namespace import KnowledgeGraphNamespace if TYPE_CHECKING: from hugedict.core.rocksdb import FileFormat @@ -378,3 +377,4 @@ def cli(data_dir: str, dbname: str, keys: list[str]): break cli() + cli() diff --git a/kgdata/wikidata/db.py b/kgdata/wikidata/db.py index d31a18a..f8a0c99 100644 --- a/kgdata/wikidata/db.py +++ b/kgdata/wikidata/db.py @@ -23,6 +23,8 @@ import serde.jl as jl from hugedict.prelude import CacheDict, RocksDBDict from hugedict.types import HugeMutableMapping +from sm.misc.funcs import import_func + from kgdata.db import ( GenericDB, deser_from_dict, @@ -40,14 +42,12 @@ unpack_int, ) from kgdata.models.entity import EntityOutLinks -from kgdata.wikidata.datasets.mention_to_entities import MentionToEntities from kgdata.wikidata.extra_ent_db import EntAttr, get_entity_attr_db from kgdata.wikidata.models import WDClass, WDProperty from kgdata.wikidata.models.wdentity import WDEntity from kgdata.wikidata.models.wdentitylabel import WDEntityLabel from kgdata.wikidata.models.wdentitylink import WDEntityWikiLink from kgdata.wikidata.models.wdentitymetadata import WDEntityMetadata -from sm.misc.funcs import import_func V = TypeVar("V", WDEntity, WDClass, WDProperty, WDEntityLabel, WDEntityWikiLink) @@ -448,3 +448,4 @@ def cli(data_dir: str, dbname: str, format: str, keys: list[str]): break cli() + cli() diff --git a/pyproject.toml b/pyproject.toml index 92d2976..45a3635 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kgdata" -version = "7.0.1" +version = "7.0.2" description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)" readme = "README.md" authors = [{ name = "Binh Vu", email = "binh@toan2.com" }] From 35c92cf36cd4cae2deb2f0a43a8e8274a2061945 Mon Sep 17 00:00:00 2001 From: Binh Vu Date: Sat, 11 May 2024 00:27:02 -0700 Subject: [PATCH 2/2] Fix code to not require `spark` or `ray` unless really need it --- CHANGELOG.md | 6 ++++++ kgdata/dataset.py | 7 ++++--- kgdata/misc/optional_import.py | 26 ++++++++++++++++++++++++++ kgdata/spark/common.py | 18 ++++++++++++------ kgdata/spark/extended_rdd.py | 6 +++--- kgdata/wikidata/__main__.py | 10 ++++------ kgdata/wikidata/datasets/entities.py | 2 +- pyproject.toml | 8 ++++---- 8 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 kgdata/misc/optional_import.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b1f0dc..3b6f227 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # CHANGE LOG +## [7.0.4] (2024-05-11) + +### Fixed + +- Fix code to not require `spark` or `ray` unless really need it + ## [7.0.2] (2024-05-10) ### Fixed diff --git a/kgdata/dataset.py b/kgdata/dataset.py index 97a120b..abd195c 100644 --- a/kgdata/dataset.py +++ b/kgdata/dataset.py @@ -25,14 +25,15 @@ import serde.json import serde.textline from hugedict.misc import Chain2, identity +from loguru import logger +from tqdm.auto import tqdm + from kgdata.config import init_dbdir_from_env +from kgdata.misc.optional_import import RDD from kgdata.misc.query import PropQuery, every from kgdata.spark import ExtendedRDD, SparkLikeInterface, get_spark_context from kgdata.spark.common import does_result_dir_exist, text_file from kgdata.spark.extended_rdd import DatasetSignature -from loguru import logger -from pyspark import RDD -from tqdm.auto import tqdm V = TypeVar("V") V2 = TypeVar("V2") diff --git a/kgdata/misc/optional_import.py b/kgdata/misc/optional_import.py new file mode 100644 index 0000000..2c02fe8 --- /dev/null +++ b/kgdata/misc/optional_import.py @@ -0,0 +1,26 @@ +from functools import wraps + +try: + from pyspark import Broadcast, SparkConf, SparkContext, TaskContext + from pyspark.rdd import RDD, portable_hash + has_spark = True +except ImportError: + RDD = None + SparkContext = None + TaskContext = None + SparkConf = None + portable_hash = None + Broadcast = None + + has_spark = False + + + +def require_spark(func): + @wraps(func) + def wrapper(*args, **kwargs): + if not has_spark: + raise ImportError("pyspark is required for function: %s" % func.__name__) + return func(*args, **kwargs) + + return wrapper diff --git a/kgdata/spark/common.py b/kgdata/spark/common.py index 9818fd6..045c059 100644 --- a/kgdata/spark/common.py +++ b/kgdata/spark/common.py @@ -27,11 +27,18 @@ import orjson import zstandard as zstd -from kgdata.misc.funcs import deser_zstd_records from loguru import logger -from pyspark import RDD, SparkConf, SparkContext, TaskContext from sm.misc.funcs import assert_not_null +from kgdata.misc.funcs import deser_zstd_records +from kgdata.misc.optional_import import ( + RDD, + SparkConf, + SparkContext, + TaskContext, + require_spark, +) + # SparkContext singleton _sc = None StrPath = Union[Path, str] @@ -46,6 +53,7 @@ V2 = TypeVar("V2") +@require_spark def get_spark_context() -> SparkContext: """Get spark context @@ -459,7 +467,7 @@ def fix_rdd(): # template to fix an rdd # ############################### # TODO: set input file - indir = "/nas/home/binhvu/workspace/sm-dev/data/wikidata/step_2/schema" + indir = os.path.expanduser("~/workspace/sm-dev/data/wikidata/step_2/schema") infile = indir + "/class_schema" # ############################### @@ -634,6 +642,4 @@ def convert(x): @dataclass class EmptyBroadcast(Generic[V]): - value: V - value: V - value: V + value: V \ No newline at end of file diff --git a/kgdata/spark/extended_rdd.py b/kgdata/spark/extended_rdd.py index 2b39f3e..2aeab3d 100644 --- a/kgdata/spark/extended_rdd.py +++ b/kgdata/spark/extended_rdd.py @@ -23,7 +23,9 @@ ) import serde.json -from kgdata.misc.funcs import deser_zstd_records +from typing_extensions import TypeGuard + +from kgdata.misc.optional_import import RDD, portable_hash from kgdata.spark.common import ( StrPath, are_records_unique, @@ -34,8 +36,6 @@ save_as_text_file, text_file, ) -from pyspark.rdd import RDD, portable_hash -from typing_extensions import TypeGuard if TYPE_CHECKING: from kgdata.dataset import Dataset diff --git a/kgdata/wikidata/__main__.py b/kgdata/wikidata/__main__.py index 516bdc3..652bd86 100644 --- a/kgdata/wikidata/__main__.py +++ b/kgdata/wikidata/__main__.py @@ -17,6 +17,9 @@ rocksdb_build_sst_file, rocksdb_ingest_sst_files, ) +from sm.misc.ray_helper import ray_map, ray_put +from timer import Timer + from kgdata.config import init_dbdir_from_env from kgdata.db import build_database from kgdata.wikidata.config import WikidataDirCfg @@ -24,7 +27,6 @@ from kgdata.wikidata.datasets.property_count import property_count from kgdata.wikidata.db import WikidataDB, get_ontcount_db, pack_int from kgdata.wikidata.extra_ent_db import EntAttr, build_extra_ent_db -from timer import Timer if TYPE_CHECKING: from hugedict.core.rocksdb import FileFormat @@ -248,11 +250,7 @@ def db_ontcount(directory: str, output: str, compact: bool, lang: str): ).options gc.collect() - import ray - from sm.misc.ray_helper import ray_map, ray_put - - ray.init() - + def _build_sst_file( infile: str, temp_dir: str, posfix: str, options: RocksDBOptions ): diff --git a/kgdata/wikidata/datasets/entities.py b/kgdata/wikidata/datasets/entities.py index b59d98a..05d6447 100644 --- a/kgdata/wikidata/datasets/entities.py +++ b/kgdata/wikidata/datasets/entities.py @@ -3,11 +3,11 @@ import orjson from loguru import logger -from pyspark import Broadcast from sm.misc.funcs import filter_duplication, is_not_null from kgdata.dataset import Dataset from kgdata.misc.funcs import split_tab_2 +from kgdata.misc.optional_import import Broadcast from kgdata.spark import get_spark_context from kgdata.wikidata.config import WikidataDirCfg from kgdata.wikidata.datasets.entity_dump import entity_dump diff --git a/pyproject.toml b/pyproject.toml index 45a3635..5281ec9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kgdata" -version = "7.0.2" +version = "7.0.4" description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)" readme = "README.md" authors = [{ name = "Binh Vu", email = "binh@toan2.com" }] @@ -25,13 +25,12 @@ dependencies = [ 'redis >= 3.5.3, < 4.0.0', 'numpy >= 1.22.3, < 2.0.0', 'requests >= 2.28.0, < 3.0.0', - 'sem-desc >= 6.7.2, < 7.0.0', + 'sem-desc >= 6.11.2, < 7.0.0', 'click >= 8.1.3, < 9.0.0', 'parsimonious >= 0.8.1, < 0.9.0', 'hugedict >= 2.12.10, < 3.0.0', 'rsoup >= 3.1.7, < 4.0.0', 'lxml >= 4.9.0, < 5.0.0', - 'ray >= 2.0.1, < 3.0.0', 'pqdict >= 1.3.0, < 2.0.0', 'ftfy >= 6.1.3, < 7.0.0', ] @@ -47,7 +46,8 @@ dev = [ 'black >= 22.10.0, < 23.0.0', ] spark = ['pyspark >= 3.5.0, < 4.0.0'] -all = ["kgdata[dev,spark]"] +ray = ['ray >= 2.0.1, < 3.0.0'] +all = ["kgdata[dev,spark,ray]"] [tool.maturin] module-name = "kgdata.core"