diff --git a/requirements.txt b/requirements.txt index e30575aee..b41a60ffd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -requests -ruamel.yaml==0.12.4 -rdflib>=4.1. -rdflib-jsonld>=0.3.0 -mistune -typing>=3.5.2 ; python_version>="2.7" -avro ; python_version<"3" +typing==3.5.2.2 ; python_version>="2.7" avro-python3 ; python_version>="3" -CacheControl -lockfile +avro==1.8.1 ; python_version<"3" +ruamel.yaml==0.12.4 +rdflib==4.2.1 +rdflib-jsonld==0.4.0 +html5lib==0.9999999 +mistune==0.7.3 +CacheControl==0.11.7 +lockfile==0.12.2 diff --git a/schema_salad/ref_resolver.py b/schema_salad/ref_resolver.py index 6acf4db39..0d2bee8b6 100644 --- a/schema_salad/ref_resolver.py +++ b/schema_salad/ref_resolver.py @@ -73,20 +73,89 @@ def merge_properties(a, b): def SubLoader(loader): # type: (Loader) -> Loader return Loader(loader.ctx, schemagraph=loader.graph, foreign_properties=loader.foreign_properties, idx=loader.idx, - cache=loader.cache, session=loader.session) + cache=loader.cache, fetcher_constructor=loader.fetcher_constructor) +class Fetcher(object): + def fetch_text(self, url): # type: (unicode) -> unicode + raise NotImplementedError() -class Loader(object): + def check_exists(self, url): # type: (unicode) -> bool + raise NotImplementedError() + + def urljoin(self, base_url, url): # type: (unicode, unicode) -> unicode + raise NotImplementedError() + + +class DefaultFetcher(Fetcher): + def __init__(self, cache, session): # type: (dict, requests.sessions.Session) -> None + self.cache = cache + self.session = session + + def fetch_text(self, url): + # type: (unicode) -> unicode + if url in self.cache: + return self.cache[url] + + split = urlparse.urlsplit(url) + scheme, path = split.scheme, split.path + + if scheme in [u'http', u'https'] and self.session: + try: + resp = self.session.get(url) + resp.raise_for_status() + except Exception as e: + raise RuntimeError(url, e) + return resp.text + elif scheme == 'file': + try: + with open(path) as fp: + read = fp.read() + if hasattr(read, "decode"): + return read.decode("utf-8") + else: + return read + except (OSError, IOError) as e: + if e.filename == path: + raise RuntimeError(unicode(e)) + else: + raise RuntimeError('Error reading %s: %s' % (url, e)) + else: + raise ValueError('Unsupported scheme in url: %s' % url) + + def check_exists(self, url): # type: (unicode) -> bool + if url in self.cache: + return True + + split = urlparse.urlsplit(url) + scheme, path = split.scheme, split.path + + if scheme in [u'http', u'https'] and self.session: + try: + resp = self.session.head(url) + resp.raise_for_status() + except Exception as e: + return False + return True + elif scheme == 'file': + return os.path.exists(path) + else: + raise ValueError('Unsupported scheme in url: %s' % url) + def urljoin(self, base_url, url): + return urlparse.urljoin(base_url, url) + +class Loader(object): def __init__(self, ctx, # type: ContextType - schemagraph=None, # type: Graph + schemagraph=None, # type: rdflib.graph.Graph foreign_properties=None, # type: Set[unicode] idx=None, # type: Dict[unicode, Union[CommentedMap, CommentedSeq, unicode]] cache=None, # type: Dict[unicode, Any] - session=None # type: requests.sessions.Session + session=None, # type: requests.sessions.Session + fetcher_constructor=None # type: Callable[[Dict[unicode, unicode], requests.sessions.Session], Fetcher] ): # type: (...) -> None + normalize = lambda url: urlparse.urlsplit(url).geturl() self.idx = None # type: Dict[unicode, Union[CommentedMap, CommentedSeq, unicode]] if idx is not None: @@ -113,12 +182,20 @@ def __init__(self, else: self.cache = {} - self.session = None # type: requests.sessions.Session - if session is not None: + if session is None: + self.session = CacheControl(requests.Session(), + cache=FileCache(os.path.join(os.environ["HOME"], ".cache", "salad"))) + else: self.session = session + + if fetcher_constructor: + self.fetcher_constructor = fetcher_constructor else: - self.session = CacheControl(requests.Session(), - cache=FileCache(os.path.join(os.environ["HOME"], ".cache", "salad"))) + self.fetcher_constructor = DefaultFetcher + self.fetcher = self.fetcher_constructor(self.cache, self.session) + + self.fetch_text = self.fetcher.fetch_text + self.check_exists = self.fetcher.check_exists self.url_fields = None # type: Set[unicode] self.scoped_ref_fields = None # type: Dict[unicode, int] @@ -171,7 +248,7 @@ def expand_url(self, elif scoped_ref is not None and not split.fragment: pass else: - url = urlparse.urljoin(base_url, url) + url = self.fetcher.urljoin(base_url, url) if vocab_term and url in self.rvocab: return self.rvocab[url] @@ -195,7 +272,7 @@ def add_namespaces(self, ns): # type: (Dict[unicode, unicode]) -> None def add_schemas(self, ns, base_url): # type: (Union[List[unicode], unicode], unicode) -> None for sch in aslist(ns): - fetchurl = urlparse.urljoin(base_url, sch) + fetchurl = self.fetcher.urljoin(base_url, sch) if fetchurl not in self.cache: _logger.debug("Getting external schema %s", fetchurl) content = self.fetch_text(fetchurl) @@ -346,6 +423,7 @@ def resolve_ref(self, if url in self.idx and (not mixin): return self.idx[url], {} + sl.raise_type = RuntimeError with sl: # "$include" directive means load raw text if inc: @@ -704,37 +782,6 @@ def resolve_all(self, return document, metadata - def fetch_text(self, url): - # type: (unicode) -> unicode - if url in self.cache: - return self.cache[url] - - split = urlparse.urlsplit(url) - scheme, path = split.scheme, split.path - - if scheme in [u'http', u'https'] and self.session: - try: - resp = self.session.get(url) - resp.raise_for_status() - except Exception as e: - raise RuntimeError(url, e) - return resp.text - elif scheme == 'file': - try: - with open(path) as fp: - read = fp.read() - if hasattr(read, "decode"): - return read.decode("utf-8") - else: - return read - except (OSError, IOError) as e: - if e.filename == path: - raise RuntimeError(unicode(e)) - else: - raise RuntimeError('Error reading %s: %s' % (url, e)) - else: - raise ValueError('Unsupported scheme in url: %s' % url) - def fetch(self, url, inject_ids=True): # type: (unicode, bool) -> Any if url in self.idx: return self.idx[url] @@ -758,21 +805,6 @@ def fetch(self, url, inject_ids=True): # type: (unicode, bool) -> Any self.idx[url] = result return result - def check_file(self, url): # type: (unicode) -> bool - split = urlparse.urlsplit(url) - scheme, path = split.scheme, split.path - - if scheme in [u'http', u'https'] and self.session: - try: - resp = self.session.head(url) - resp.raise_for_status() - except Exception as e: - return False - return True - elif scheme == 'file': - return os.path.exists(path) - else: - raise ValueError('Unsupported scheme in url: %s' % url) FieldType = TypeVar('FieldType', unicode, CommentedSeq, CommentedMap) @@ -809,13 +841,13 @@ def validate_link(self, field, link, docid): if link not in self.vocab and link not in self.idx and link not in self.rvocab: if field in self.scoped_ref_fields: return self.validate_scoped(field, link, docid) - elif not self.check_file(link): + elif not self.check_exists(link): raise validate.ValidationException( "Field `%s` contains undefined reference to `%s`" % (field, link)) elif link not in self.idx and link not in self.rvocab: if field in self.scoped_ref_fields: return self.validate_scoped(field, link, docid) - elif not self.check_file(link): + elif not self.check_exists(link): raise validate.ValidationException( "Field `%s` contains undefined reference to `%s`" % (field, link)) elif isinstance(link, CommentedSeq): diff --git a/schema_salad/schema.py b/schema_salad/schema.py index ff2d18cef..342ec4680 100644 --- a/schema_salad/schema.py +++ b/schema_salad/schema.py @@ -188,7 +188,7 @@ def load_schema(schema_ref, # type: Union[CommentedMap, CommentedSeq, unicode] metaschema_names, metaschema_doc, metaschema_loader = get_metaschema() if cache is not None: - metaschema_loader.cache = cache + metaschema_loader.cache.update(cache) schema_doc, schema_metadata = metaschema_loader.resolve_ref(schema_ref, "") if not isinstance(schema_doc, list): diff --git a/schema_salad/tests/test_fetch.py b/schema_salad/tests/test_fetch.py new file mode 100644 index 000000000..8fb9e5a69 --- /dev/null +++ b/schema_salad/tests/test_fetch.py @@ -0,0 +1,57 @@ +import unittest +import schema_salad.ref_resolver +import schema_salad.main +import schema_salad.schema +from schema_salad.jsonld_context import makerdf +import rdflib +import ruamel.yaml as yaml +import json +import os +import urlparse + +class TestFetcher(unittest.TestCase): + def test_fetcher(self): + class TestFetcher(schema_salad.ref_resolver.Fetcher): + def __init__(self, a, b): + pass + + def fetch_text(self, url): # type: (unicode) -> unicode + if url == "keep:abc+123/foo.txt": + return "hello: keepfoo" + if url.endswith("foo.txt"): + return "hello: foo" + else: + raise RuntimeError("Not foo.txt") + + def check_exists(self, url): # type: (unicode) -> bool + if url.endswith("foo.txt"): + return True + else: + return False + + def urljoin(self, base, url): + urlsp = urlparse.urlsplit(url) + if urlsp.scheme: + return url + basesp = urlparse.urlsplit(base) + + if basesp.scheme == "keep": + return base + "/" + url + return urlparse.urljoin(base, url) + + loader = schema_salad.ref_resolver.Loader({}, fetcher_constructor=TestFetcher) + self.assertEqual({"hello": "foo"}, loader.resolve_ref("foo.txt")[0]) + self.assertEqual({"hello": "keepfoo"}, loader.resolve_ref("foo.txt", base_url="keep:abc+123")[0]) + self.assertTrue(loader.check_exists("foo.txt")) + + with self.assertRaises(RuntimeError): + loader.resolve_ref("bar.txt") + self.assertFalse(loader.check_exists("bar.txt")) + + def test_cache(self): + loader = schema_salad.ref_resolver.Loader({}) + foo = "file://%s/foo.txt" % os.getcwd() + loader.cache.update({foo: "hello: foo"}) + print loader.cache + self.assertEqual({"hello": "foo"}, loader.resolve_ref("foo.txt")[0]) + self.assertTrue(loader.check_exists(foo)) diff --git a/setup.py b/setup.py index adf31ed4a..97c466663 100755 --- a/setup.py +++ b/setup.py @@ -28,13 +28,15 @@ requirements = [] install_requires = [ - 'requests', - 'ruamel.yaml == 0.12.4', - 'rdflib >= 4.1.0', - 'rdflib-jsonld >= 0.3.0', - 'mistune', - 'typing >= 3.5.2', - 'CacheControl', + 'setuptools', + 'requests >= 1.0', + 'ruamel.yaml >= 0.12.4, < 0.12.5', + 'rdflib >= 4.2.0, < 4.3.0', + 'rdflib-jsonld >= 0.3.0, < 0.5.0', + 'html5lib >= 0.90, <= 0.9999999', + 'mistune >= 0.7.3, < 0.8', + 'typing >= 3.5.2, < 3.6', + 'CacheControl >= 0.11.7, < 0.12', 'lockfile >= 0.9'] install_requires.append("avro") # TODO: remove me once cwltool is @@ -46,7 +48,7 @@ extras_require = {} # TODO: to be removed when the above is added setup(name='schema-salad', - version='2.0', + version='2.1', description='Schema Annotations for Linked Avro Data (SALAD)', long_description=open(README).read(), author='Common workflow language working group',