merge 1.21 changes into salad 2.0 (#68)
* Loader constructor can accept custom "Fetcher" object for fetching files and
checking links.
* Add test for custom fetcher feature.
* Fetcher is a constructor instead of an object.  Fix load_schema to update cache instead of replacing it.
* Add cache test.  check_exists checks cache.
* Fetcher includes custom urljoin.
* Fix fetcher_constructor to default to None instead of DefaultFetcher.
* Adjust package dependencies to be more specific about versions.
* Linting
* Tweak versioning to reduce chance of future unpleasant surprises from 3rd party
upgrades and clean up requirements.txt.
* Bump to 2.1
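
The headline change is the pluggable fetch layer: Loader now accepts a fetcher_constructor callable, invokes it with the loader's cache dict and requests session, and expects a Fetcher back. A minimal sketch of the idea, modeled on the new schema_salad/tests/test_fetch.py below (the InMemoryFetcher class, the inmem:// scheme, and the document contents are hypothetical):

import urlparse  # Python 2, matching this codebase

import schema_salad.ref_resolver

class InMemoryFetcher(schema_salad.ref_resolver.Fetcher):
    # Hypothetical fetcher that serves documents from a dict instead of
    # touching HTTP or the filesystem.
    docs = {"inmem://example/doc.yml": "hello: world"}

    def __init__(self, cache, session):
        # Signature imposed by Loader: fetcher_constructor(cache, session).
        pass

    def fetch_text(self, url):
        if url in self.docs:
            return self.docs[url]
        raise RuntimeError("not found: %s" % url)

    def check_exists(self, url):
        return url in self.docs

    def urljoin(self, base_url, url):
        return urlparse.urljoin(base_url, url)

loader = schema_salad.ref_resolver.Loader({}, fetcher_constructor=InMemoryFetcher)
print loader.resolve_ref("inmem://example/doc.yml")[0]  # {"hello": "world"}

Because fetch_text, check_exists, and URL joining all route through the fetcher, one small class is enough to teach the loader a new URL scheme.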
tetron committed Dec 16, 2016
1 parent a1db0ac commit 5a15c58
Showing 5 changed files with 167 additions and 76 deletions.
18 changes: 9 additions & 9 deletions requirements.txt
@@ -1,10 +1,10 @@
-requests
-ruamel.yaml==0.12.4
-rdflib>=4.1.
-rdflib-jsonld>=0.3.0
-mistune
-typing>=3.5.2 ; python_version>="2.7"
-avro ; python_version<"3"
+typing==3.5.2.2 ; python_version>="2.7"
 avro-python3 ; python_version>="3"
-CacheControl
-lockfile
+avro==1.8.1 ; python_version<"3"
+ruamel.yaml==0.12.4
+rdflib==4.2.1
+rdflib-jsonld==0.4.0
+html5lib==0.9999999
+mistune==0.7.3
+CacheControl==0.11.7
+lockfile==0.12.2
148 changes: 90 additions & 58 deletions schema_salad/ref_resolver.py
@@ -73,20 +73,89 @@ def merge_properties(a, b):
 def SubLoader(loader):  # type: (Loader) -> Loader
     return Loader(loader.ctx, schemagraph=loader.graph,
                   foreign_properties=loader.foreign_properties, idx=loader.idx,
-                  cache=loader.cache, session=loader.session)
+                  cache=loader.cache, fetcher_constructor=loader.fetcher_constructor)
+
+
+class Fetcher(object):
+    def fetch_text(self, url):  # type: (unicode) -> unicode
+        raise NotImplementedError()
+
+    def check_exists(self, url):  # type: (unicode) -> bool
+        raise NotImplementedError()
+
+    def urljoin(self, base_url, url):  # type: (unicode, unicode) -> unicode
+        raise NotImplementedError()
+
+
+class DefaultFetcher(Fetcher):
+    def __init__(self, cache, session):  # type: (dict, requests.sessions.Session) -> None
+        self.cache = cache
+        self.session = session
+
+    def fetch_text(self, url):
+        # type: (unicode) -> unicode
+        if url in self.cache:
+            return self.cache[url]
+
+        split = urlparse.urlsplit(url)
+        scheme, path = split.scheme, split.path
+
+        if scheme in [u'http', u'https'] and self.session:
+            try:
+                resp = self.session.get(url)
+                resp.raise_for_status()
+            except Exception as e:
+                raise RuntimeError(url, e)
+            return resp.text
+        elif scheme == 'file':
+            try:
+                with open(path) as fp:
+                    read = fp.read()
+                    if hasattr(read, "decode"):
+                        return read.decode("utf-8")
+                    else:
+                        return read
+            except (OSError, IOError) as e:
+                if e.filename == path:
+                    raise RuntimeError(unicode(e))
+                else:
+                    raise RuntimeError('Error reading %s: %s' % (url, e))
+        else:
+            raise ValueError('Unsupported scheme in url: %s' % url)
+
+    def check_exists(self, url):  # type: (unicode) -> bool
+        if url in self.cache:
+            return True
+
+        split = urlparse.urlsplit(url)
+        scheme, path = split.scheme, split.path
+
+        if scheme in [u'http', u'https'] and self.session:
+            try:
+                resp = self.session.head(url)
+                resp.raise_for_status()
+            except Exception as e:
+                return False
+            return True
+        elif scheme == 'file':
+            return os.path.exists(path)
+        else:
+            raise ValueError('Unsupported scheme in url: %s' % url)
+
+    def urljoin(self, base_url, url):
+        return urlparse.urljoin(base_url, url)
+
+
 class Loader(object):
     def __init__(self,
                  ctx,  # type: ContextType
-                 schemagraph=None,  # type: Graph
+                 schemagraph=None,  # type: rdflib.graph.Graph
                  foreign_properties=None,  # type: Set[unicode]
                  idx=None,  # type: Dict[unicode, Union[CommentedMap, CommentedSeq, unicode]]
                  cache=None,  # type: Dict[unicode, Any]
-                 session=None  # type: requests.sessions.Session
+                 session=None,  # type: requests.sessions.Session
+                 fetcher_constructor=None  # type: Callable[[Dict[unicode, unicode], requests.sessions.Session], Fetcher]
                  ):
         # type: (...) -> None
 
         normalize = lambda url: urlparse.urlsplit(url).geturl()
         self.idx = None  # type: Dict[unicode, Union[CommentedMap, CommentedSeq, unicode]]
         if idx is not None:
@@ -113,12 +182,20 @@ def __init__(self,
         else:
             self.cache = {}
 
-        self.session = None  # type: requests.sessions.Session
-        if session is not None:
-            self.session = session
-        else:
-            self.session = CacheControl(requests.Session(),
-                cache=FileCache(os.path.join(os.environ["HOME"], ".cache", "salad")))
+        if session is None:
+            self.session = CacheControl(requests.Session(),
+                cache=FileCache(os.path.join(os.environ["HOME"], ".cache", "salad")))
+        else:
+            self.session = session
+
+        if fetcher_constructor:
+            self.fetcher_constructor = fetcher_constructor
+        else:
+            self.fetcher_constructor = DefaultFetcher
+        self.fetcher = self.fetcher_constructor(self.cache, self.session)
+
+        self.fetch_text = self.fetcher.fetch_text
+        self.check_exists = self.fetcher.check_exists
 
         self.url_fields = None  # type: Set[unicode]
         self.scoped_ref_fields = None  # type: Dict[unicode, int]
@@ -171,7 +248,7 @@ def expand_url(self,
         elif scoped_ref is not None and not split.fragment:
             pass
         else:
-            url = urlparse.urljoin(base_url, url)
+            url = self.fetcher.urljoin(base_url, url)
 
         if vocab_term and url in self.rvocab:
             return self.rvocab[url]
@@ -195,7 +272,7 @@ def add_namespaces(self, ns):  # type: (Dict[unicode, unicode]) -> None
     def add_schemas(self, ns, base_url):
         # type: (Union[List[unicode], unicode], unicode) -> None
         for sch in aslist(ns):
-            fetchurl = urlparse.urljoin(base_url, sch)
+            fetchurl = self.fetcher.urljoin(base_url, sch)
             if fetchurl not in self.cache:
                 _logger.debug("Getting external schema %s", fetchurl)
                 content = self.fetch_text(fetchurl)
@@ -346,6 +423,7 @@ def resolve_ref(self,
         if url in self.idx and (not mixin):
             return self.idx[url], {}
 
+        sl.raise_type = RuntimeError
         with sl:
             # "$include" directive means load raw text
             if inc:
@@ -704,37 +782,6 @@ def resolve_all(self,
 
         return document, metadata
 
-    def fetch_text(self, url):
-        # type: (unicode) -> unicode
-        if url in self.cache:
-            return self.cache[url]
-
-        split = urlparse.urlsplit(url)
-        scheme, path = split.scheme, split.path
-
-        if scheme in [u'http', u'https'] and self.session:
-            try:
-                resp = self.session.get(url)
-                resp.raise_for_status()
-            except Exception as e:
-                raise RuntimeError(url, e)
-            return resp.text
-        elif scheme == 'file':
-            try:
-                with open(path) as fp:
-                    read = fp.read()
-                    if hasattr(read, "decode"):
-                        return read.decode("utf-8")
-                    else:
-                        return read
-            except (OSError, IOError) as e:
-                if e.filename == path:
-                    raise RuntimeError(unicode(e))
-                else:
-                    raise RuntimeError('Error reading %s: %s' % (url, e))
-        else:
-            raise ValueError('Unsupported scheme in url: %s' % url)
-
     def fetch(self, url, inject_ids=True):  # type: (unicode, bool) -> Any
         if url in self.idx:
             return self.idx[url]
@@ -758,21 +805,6 @@ def fetch(self, url, inject_ids=True):  # type: (unicode, bool) -> Any
             self.idx[url] = result
         return result
 
-    def check_file(self, url):  # type: (unicode) -> bool
-        split = urlparse.urlsplit(url)
-        scheme, path = split.scheme, split.path
-
-        if scheme in [u'http', u'https'] and self.session:
-            try:
-                resp = self.session.head(url)
-                resp.raise_for_status()
-            except Exception as e:
-                return False
-            return True
-        elif scheme == 'file':
-            return os.path.exists(path)
-        else:
-            raise ValueError('Unsupported scheme in url: %s' % url)
 
 FieldType = TypeVar('FieldType', unicode, CommentedSeq, CommentedMap)
 
@@ -809,13 +841,13 @@ def validate_link(self, field, link, docid):
         if link not in self.vocab and link not in self.idx and link not in self.rvocab:
             if field in self.scoped_ref_fields:
                 return self.validate_scoped(field, link, docid)
-            elif not self.check_file(link):
+            elif not self.check_exists(link):
                 raise validate.ValidationException(
                     "Field `%s` contains undefined reference to `%s`" % (field, link))
         elif link not in self.idx and link not in self.rvocab:
             if field in self.scoped_ref_fields:
                 return self.validate_scoped(field, link, docid)
-            elif not self.check_file(link):
+            elif not self.check_exists(link):
                 raise validate.ValidationException(
                     "Field `%s` contains undefined reference to `%s`" % (field, link))
         elif isinstance(link, CommentedSeq):
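One consequence of the constructor-injection design: Loader builds its fetcher as fetcher_constructor(self.cache, self.session), so any callable with that two-argument shape works, not only a class. A hypothetical sketch of adapting a fetcher that needs extra configuration, using functools.partial:

import functools

import schema_salad.ref_resolver

class TokenFetcher(schema_salad.ref_resolver.DefaultFetcher):
    # Hypothetical: a DefaultFetcher variant that sends an auth token
    # with every HTTP request.
    def __init__(self, token, cache, session):
        schema_salad.ref_resolver.DefaultFetcher.__init__(self, cache, session)
        self.session.headers["Authorization"] = "Bearer %s" % token

loader = schema_salad.ref_resolver.Loader(
    {}, fetcher_constructor=functools.partial(TokenFetcher, "my-secret-token"))
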
2 changes: 1 addition & 1 deletion schema_salad/schema.py
@@ -188,7 +188,7 @@ def load_schema(schema_ref,  # type: Union[CommentedMap, CommentedSeq, unicode]
 
     metaschema_names, metaschema_doc, metaschema_loader = get_metaschema()
     if cache is not None:
-        metaschema_loader.cache = cache
+        metaschema_loader.cache.update(cache)
     schema_doc, schema_metadata = metaschema_loader.resolve_ref(schema_ref, "")
 
     if not isinstance(schema_doc, list):
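
The load_schema change above is one line but behavioral: assigning metaschema_loader.cache = cache discarded the entries the metaschema loader was constructed with, while dict.update merges the caller's cache into them. A toy illustration of the difference (FakeLoader, the URLs, and the values are made up):

class FakeLoader(object):
    def __init__(self):
        # Stands in for the metaschema loader's pre-populated cache.
        self.cache = {"https://w3id.org/cwl/salad": "<metaschema text>"}

user_cache = {"file:///tmp/extra.yml": "hello: foo"}

old = FakeLoader()
old.cache = user_cache                # old behavior: built-in entry is lost
assert "https://w3id.org/cwl/salad" not in old.cache

new = FakeLoader()
new.cache.update(user_cache)          # new behavior: built-ins survive
assert "https://w3id.org/cwl/salad" in new.cache
assert "file:///tmp/extra.yml" in new.cache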
57 changes: 57 additions & 0 deletions schema_salad/tests/test_fetch.py
@@ -0,0 +1,57 @@
+import unittest
+import schema_salad.ref_resolver
+import schema_salad.main
+import schema_salad.schema
+from schema_salad.jsonld_context import makerdf
+import rdflib
+import ruamel.yaml as yaml
+import json
+import os
+import urlparse
+
+class TestFetcher(unittest.TestCase):
+    def test_fetcher(self):
+        class TestFetcher(schema_salad.ref_resolver.Fetcher):
+            def __init__(self, a, b):
+                pass
+
+            def fetch_text(self, url):  # type: (unicode) -> unicode
+                if url == "keep:abc+123/foo.txt":
+                    return "hello: keepfoo"
+                if url.endswith("foo.txt"):
+                    return "hello: foo"
+                else:
+                    raise RuntimeError("Not foo.txt")
+
+            def check_exists(self, url):  # type: (unicode) -> bool
+                if url.endswith("foo.txt"):
+                    return True
+                else:
+                    return False
+
+            def urljoin(self, base, url):
+                urlsp = urlparse.urlsplit(url)
+                if urlsp.scheme:
+                    return url
+                basesp = urlparse.urlsplit(base)
+
+                if basesp.scheme == "keep":
+                    return base + "/" + url
+                return urlparse.urljoin(base, url)
+
+        loader = schema_salad.ref_resolver.Loader({}, fetcher_constructor=TestFetcher)
+        self.assertEqual({"hello": "foo"}, loader.resolve_ref("foo.txt")[0])
+        self.assertEqual({"hello": "keepfoo"}, loader.resolve_ref("foo.txt", base_url="keep:abc+123")[0])
+        self.assertTrue(loader.check_exists("foo.txt"))
+
+        with self.assertRaises(RuntimeError):
+            loader.resolve_ref("bar.txt")
+        self.assertFalse(loader.check_exists("bar.txt"))
+
+    def test_cache(self):
+        loader = schema_salad.ref_resolver.Loader({})
+        foo = "file://%s/foo.txt" % os.getcwd()
+        loader.cache.update({foo: "hello: foo"})
+        print loader.cache
+        self.assertEqual({"hello": "foo"}, loader.resolve_ref("foo.txt")[0])
+        self.assertTrue(loader.check_exists(foo))
18 changes: 10 additions & 8 deletions setup.py
@@ -28,13 +28,15 @@
 requirements = []
 
 install_requires = [
-    'requests',
-    'ruamel.yaml == 0.12.4',
-    'rdflib >= 4.1.0',
-    'rdflib-jsonld >= 0.3.0',
-    'mistune',
-    'typing >= 3.5.2',
-    'CacheControl',
+    'setuptools',
+    'requests >= 1.0',
+    'ruamel.yaml >= 0.12.4, < 0.12.5',
+    'rdflib >= 4.2.0, < 4.3.0',
+    'rdflib-jsonld >= 0.3.0, < 0.5.0',
+    'html5lib >= 0.90, <= 0.9999999',
+    'mistune >= 0.7.3, < 0.8',
+    'typing >= 3.5.2, < 3.6',
+    'CacheControl >= 0.11.7, < 0.12',
     'lockfile >= 0.9']
 
 install_requires.append("avro")  # TODO: remove me once cwltool is
@@ -46,7 +48,7 @@
 extras_require = {}  # TODO: to be removed when the above is added
 
 setup(name='schema-salad',
-      version='2.0',
+      version='2.1',
       description='Schema Annotations for Linked Avro Data (SALAD)',
       long_description=open(README).read(),
       author='Common workflow language working group',
