merge 1.21 changes into salad 2.0 (#68)
* Loader constructor can accept custom "Fetcher" object for fetching files and
checking links.
* Add test for custom fetcher feature.
* Fetcher is a constructor instead of an object.  Fix load_schema to update cache instead of replacing it.
* Add cache test.  check_exists checks cache.
* Fetcher includes custom urljoin.
* Fix fetcher_constructor to default to None instead of DefaultFetcher.
* Adjust package dependencies to be more specific about versions.
* Linting
* Tweak versioning to reduce chance of future unpleasant surprises from 3rd party
upgrades and clean up requirements.txt.
* Bump to 2.1
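
The headline change is the pluggable fetch layer: Loader now accepts a fetcher_constructor callable, invokes it with the loader's cache dict and requests session, and expects a Fetcher back. A minimal sketch of the idea, modeled on the new schema_salad/tests/test_fetch.py below (the InMemoryFetcher class, the inmem:// scheme, and the document contents are hypothetical):

import urlparse  # Python 2, matching this codebase

import schema_salad.ref_resolver

class InMemoryFetcher(schema_salad.ref_resolver.Fetcher):
    # Hypothetical fetcher that serves documents from a dict instead of
    # touching HTTP or the filesystem.
    docs = {"inmem://example/doc.yml": "hello: world"}

    def __init__(self, cache, session):
        # Signature imposed by Loader: fetcher_constructor(cache, session).
        pass

    def fetch_text(self, url):
        if url in self.docs:
            return self.docs[url]
        raise RuntimeError("not found: %s" % url)

    def check_exists(self, url):
        return url in self.docs

    def urljoin(self, base_url, url):
        return urlparse.urljoin(base_url, url)

loader = schema_salad.ref_resolver.Loader({}, fetcher_constructor=InMemoryFetcher)
print loader.resolve_ref("inmem://example/doc.yml")[0]  # {"hello": "world"}

Because fetch_text, check_exists, and URL joining all route through the fetcher, one small class is enough to teach the loader a new URL scheme.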
tetron committed Dec 16, 2016
1 parent a1db0ac commit 5a15c58
Showing 5 changed files with 167 additions and 76 deletions.
18 changes: 9 additions & 9 deletions requirements.txt
@@ -1,10 +1,10 @@
-requests
-ruamel.yaml==0.12.4
-rdflib>=4.1.
-rdflib-jsonld>=0.3.0
-mistune
-typing>=3.5.2 ; python_version>="2.7"
-avro ; python_version<"3"
+typing==3.5.2.2 ; python_version>="2.7"
 avro-python3 ; python_version>="3"
-CacheControl
-lockfile
+avro==1.8.1 ; python_version<"3"
+ruamel.yaml==0.12.4
+rdflib==4.2.1
+rdflib-jsonld==0.4.0
+html5lib==0.9999999
+mistune==0.7.3
+CacheControl==0.11.7
+lockfile==0.12.2
148 changes: 90 additions & 58 deletions schema_salad/ref_resolver.py
@@ -73,20 +73,89 @@ def merge_properties(a, b):
 def SubLoader(loader):  # type: (Loader) -> Loader
     return Loader(loader.ctx, schemagraph=loader.graph,
                   foreign_properties=loader.foreign_properties, idx=loader.idx,
-                  cache=loader.cache, session=loader.session)
+                  cache=loader.cache, fetcher_constructor=loader.fetcher_constructor)
+
+
+class Fetcher(object):
+    def fetch_text(self, url):  # type: (unicode) -> unicode
+        raise NotImplementedError()
+
+    def check_exists(self, url):  # type: (unicode) -> bool
+        raise NotImplementedError()
+
+    def urljoin(self, base_url, url):  # type: (unicode, unicode) -> unicode
+        raise NotImplementedError()
+
+
+class DefaultFetcher(Fetcher):
+    def __init__(self, cache, session):  # type: (dict, requests.sessions.Session) -> None
+        self.cache = cache
+        self.session = session
+
+    def fetch_text(self, url):
+        # type: (unicode) -> unicode
+        if url in self.cache:
+            return self.cache[url]
+
+        split = urlparse.urlsplit(url)
+        scheme, path = split.scheme, split.path
+
+        if scheme in [u'http', u'https'] and self.session:
+            try:
+                resp = self.session.get(url)
+                resp.raise_for_status()
+            except Exception as e:
+                raise RuntimeError(url, e)
+            return resp.text
+        elif scheme == 'file':
+            try:
+                with open(path) as fp:
+                    read = fp.read()
+                    if hasattr(read, "decode"):
+                        return read.decode("utf-8")
+                    else:
+                        return read
+            except (OSError, IOError) as e:
+                if e.filename == path:
+                    raise RuntimeError(unicode(e))
+                else:
+                    raise RuntimeError('Error reading %s: %s' % (url, e))
+        else:
+            raise ValueError('Unsupported scheme in url: %s' % url)
+
+    def check_exists(self, url):  # type: (unicode) -> bool
+        if url in self.cache:
+            return True
+
+        split = urlparse.urlsplit(url)
+        scheme, path = split.scheme, split.path
+
+        if scheme in [u'http', u'https'] and self.session:
+            try:
+                resp = self.session.head(url)
+                resp.raise_for_status()
+            except Exception as e:
+                return False
+            return True
+        elif scheme == 'file':
+            return os.path.exists(path)
+        else:
+            raise ValueError('Unsupported scheme in url: %s' % url)
+
+    def urljoin(self, base_url, url):
+        return urlparse.urljoin(base_url, url)
+
+
 class Loader(object):
     def __init__(self,
                  ctx,  # type: ContextType
-                 schemagraph=None,  # type: Graph
+                 schemagraph=None,  # type: rdflib.graph.Graph
                  foreign_properties=None,  # type: Set[unicode]
                  idx=None,  # type: Dict[unicode, Union[CommentedMap, CommentedSeq, unicode]]
                  cache=None,  # type: Dict[unicode, Any]
-                 session=None  # type: requests.sessions.Session
+                 session=None,  # type: requests.sessions.Session
+                 fetcher_constructor=None  # type: Callable[[Dict[unicode, unicode], requests.sessions.Session], Fetcher]
                  ):
         # type: (...) -> None
 
         normalize = lambda url: urlparse.urlsplit(url).geturl()
         self.idx = None  # type: Dict[unicode, Union[CommentedMap, CommentedSeq, unicode]]
         if idx is not None:
@@ -113,12 +182,20 @@ def __init__(self,
         else:
             self.cache = {}
 
-        self.session = None  # type: requests.sessions.Session
-        if session is not None:
-            self.session = session
-        else:
-            self.session = CacheControl(requests.Session(),
-                cache=FileCache(os.path.join(os.environ["HOME"], ".cache", "salad")))
+        if session is None:
+            self.session = CacheControl(requests.Session(),
+                cache=FileCache(os.path.join(os.environ["HOME"], ".cache", "salad")))
+        else:
+            self.session = session
+
+        if fetcher_constructor:
+            self.fetcher_constructor = fetcher_constructor
+        else:
+            self.fetcher_constructor = DefaultFetcher
+        self.fetcher = self.fetcher_constructor(self.cache, self.session)
+
+        self.fetch_text = self.fetcher.fetch_text
+        self.check_exists = self.fetcher.check_exists
 
         self.url_fields = None  # type: Set[unicode]
         self.scoped_ref_fields = None  # type: Dict[unicode, int]
@@ -171,7 +248,7 @@ def expand_url(self,
         elif scoped_ref is not None and not split.fragment:
             pass
         else:
-            url = urlparse.urljoin(base_url, url)
+            url = self.fetcher.urljoin(base_url, url)
 
         if vocab_term and url in self.rvocab:
             return self.rvocab[url]
@@ -195,7 +272,7 @@ def add_namespaces(self, ns):  # type: (Dict[unicode, unicode]) -> None
     def add_schemas(self, ns, base_url):
         # type: (Union[List[unicode], unicode], unicode) -> None
         for sch in aslist(ns):
-            fetchurl = urlparse.urljoin(base_url, sch)
+            fetchurl = self.fetcher.urljoin(base_url, sch)
             if fetchurl not in self.cache:
                 _logger.debug("Getting external schema %s", fetchurl)
                 content = self.fetch_text(fetchurl)
@@ -346,6 +423,7 @@ def resolve_ref(self,
         if url in self.idx and (not mixin):
             return self.idx[url], {}
 
+        sl.raise_type = RuntimeError
         with sl:
             # "$include" directive means load raw text
             if inc:
@@ -704,37 +782,6 @@ def resolve_all(self,
 
         return document, metadata
 
-    def fetch_text(self, url):
-        # type: (unicode) -> unicode
-        if url in self.cache:
-            return self.cache[url]
-
-        split = urlparse.urlsplit(url)
-        scheme, path = split.scheme, split.path
-
-        if scheme in [u'http', u'https'] and self.session:
-            try:
-                resp = self.session.get(url)
-                resp.raise_for_status()
-            except Exception as e:
-                raise RuntimeError(url, e)
-            return resp.text
-        elif scheme == 'file':
-            try:
-                with open(path) as fp:
-                    read = fp.read()
-                    if hasattr(read, "decode"):
-                        return read.decode("utf-8")
-                    else:
-                        return read
-            except (OSError, IOError) as e:
-                if e.filename == path:
-                    raise RuntimeError(unicode(e))
-                else:
-                    raise RuntimeError('Error reading %s: %s' % (url, e))
-        else:
-            raise ValueError('Unsupported scheme in url: %s' % url)
-
     def fetch(self, url, inject_ids=True):  # type: (unicode, bool) -> Any
         if url in self.idx:
             return self.idx[url]
@@ -758,21 +805,6 @@ def fetch(self, url, inject_ids=True):  # type: (unicode, bool) -> Any
             self.idx[url] = result
         return result
 
-    def check_file(self, url):  # type: (unicode) -> bool
-        split = urlparse.urlsplit(url)
-        scheme, path = split.scheme, split.path
-
-        if scheme in [u'http', u'https'] and self.session:
-            try:
-                resp = self.session.head(url)
-                resp.raise_for_status()
-            except Exception as e:
-                return False
-            return True
-        elif scheme == 'file':
-            return os.path.exists(path)
-        else:
-            raise ValueError('Unsupported scheme in url: %s' % url)
 
 FieldType = TypeVar('FieldType', unicode, CommentedSeq, CommentedMap)
 
@@ -809,13 +841,13 @@ def validate_link(self, field, link, docid):
         if link not in self.vocab and link not in self.idx and link not in self.rvocab:
             if field in self.scoped_ref_fields:
                 return self.validate_scoped(field, link, docid)
-            elif not self.check_file(link):
+            elif not self.check_exists(link):
                 raise validate.ValidationException(
                     "Field `%s` contains undefined reference to `%s`" % (field, link))
         elif link not in self.idx and link not in self.rvocab:
             if field in self.scoped_ref_fields:
                 return self.validate_scoped(field, link, docid)
-            elif not self.check_file(link):
+            elif not self.check_exists(link):
                 raise validate.ValidationException(
                     "Field `%s` contains undefined reference to `%s`" % (field, link))
         elif isinstance(link, CommentedSeq):
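One consequence of the constructor-injection design: Loader builds its fetcher as fetcher_constructor(self.cache, self.session), so any callable with that two-argument shape works, not only a class. A hypothetical sketch of adapting a fetcher that needs extra configuration, using functools.partial:

import functools

import schema_salad.ref_resolver

class TokenFetcher(schema_salad.ref_resolver.DefaultFetcher):
    # Hypothetical: a DefaultFetcher variant that sends an auth token
    # with every HTTP request.
    def __init__(self, token, cache, session):
        schema_salad.ref_resolver.DefaultFetcher.__init__(self, cache, session)
        self.session.headers["Authorization"] = "Bearer %s" % token

loader = schema_salad.ref_resolver.Loader(
    {}, fetcher_constructor=functools.partial(TokenFetcher, "my-secret-token"))
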
2 changes: 1 addition & 1 deletion schema_salad/schema.py
@@ -188,7 +188,7 @@ def load_schema(schema_ref,  # type: Union[CommentedMap, CommentedSeq, unicode]
 
     metaschema_names, metaschema_doc, metaschema_loader = get_metaschema()
     if cache is not None:
-        metaschema_loader.cache = cache
+        metaschema_loader.cache.update(cache)
     schema_doc, schema_metadata = metaschema_loader.resolve_ref(schema_ref, "")
 
     if not isinstance(schema_doc, list):
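
The load_schema change above is one line but behavioral: assigning metaschema_loader.cache = cache discarded the entries the metaschema loader was constructed with, while dict.update merges the caller's cache into them. A toy illustration of the difference (FakeLoader, the URLs, and the values are made up):

class FakeLoader(object):
    def __init__(self):
        # Stands in for the metaschema loader's pre-populated cache.
        self.cache = {"https://w3id.org/cwl/salad": "<metaschema text>"}

user_cache = {"file:///tmp/extra.yml": "hello: foo"}

old = FakeLoader()
old.cache = user_cache                # old behavior: built-in entry is lost
assert "https://w3id.org/cwl/salad" not in old.cache

new = FakeLoader()
new.cache.update(user_cache)          # new behavior: built-ins survive
assert "https://w3id.org/cwl/salad" in new.cache
assert "file:///tmp/extra.yml" in new.cache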
57 changes: 57 additions & 0 deletions schema_salad/tests/test_fetch.py
@@ -0,0 +1,57 @@
+import unittest
+import schema_salad.ref_resolver
+import schema_salad.main
+import schema_salad.schema
+from schema_salad.jsonld_context import makerdf
+import rdflib
+import ruamel.yaml as yaml
+import json
+import os
+import urlparse
+
+class TestFetcher(unittest.TestCase):
+    def test_fetcher(self):
+        class TestFetcher(schema_salad.ref_resolver.Fetcher):
+            def __init__(self, a, b):
+                pass
+
+            def fetch_text(self, url):  # type: (unicode) -> unicode
+                if url == "keep:abc+123/foo.txt":
+                    return "hello: keepfoo"
+                if url.endswith("foo.txt"):
+                    return "hello: foo"
+                else:
+                    raise RuntimeError("Not foo.txt")
+
+            def check_exists(self, url):  # type: (unicode) -> bool
+                if url.endswith("foo.txt"):
+                    return True
+                else:
+                    return False
+
+            def urljoin(self, base, url):
+                urlsp = urlparse.urlsplit(url)
+                if urlsp.scheme:
+                    return url
+                basesp = urlparse.urlsplit(base)
+
+                if basesp.scheme == "keep":
+                    return base + "/" + url
+                return urlparse.urljoin(base, url)
+
+        loader = schema_salad.ref_resolver.Loader({}, fetcher_constructor=TestFetcher)
+        self.assertEqual({"hello": "foo"}, loader.resolve_ref("foo.txt")[0])
+        self.assertEqual({"hello": "keepfoo"}, loader.resolve_ref("foo.txt", base_url="keep:abc+123")[0])
+        self.assertTrue(loader.check_exists("foo.txt"))
+
+        with self.assertRaises(RuntimeError):
+            loader.resolve_ref("bar.txt")
+        self.assertFalse(loader.check_exists("bar.txt"))
+
+    def test_cache(self):
+        loader = schema_salad.ref_resolver.Loader({})
+        foo = "file://%s/foo.txt" % os.getcwd()
+        loader.cache.update({foo: "hello: foo"})
+        print loader.cache
+        self.assertEqual({"hello": "foo"}, loader.resolve_ref("foo.txt")[0])
+        self.assertTrue(loader.check_exists(foo))
18 changes: 10 additions & 8 deletions setup.py
@@ -28,13 +28,15 @@
 requirements = []
 
 install_requires = [
-    'requests',
-    'ruamel.yaml == 0.12.4',
-    'rdflib >= 4.1.0',
-    'rdflib-jsonld >= 0.3.0',
-    'mistune',
-    'typing >= 3.5.2',
-    'CacheControl',
+    'setuptools',
+    'requests >= 1.0',
+    'ruamel.yaml >= 0.12.4, < 0.12.5',
+    'rdflib >= 4.2.0, < 4.3.0',
+    'rdflib-jsonld >= 0.3.0, < 0.5.0',
+    'html5lib >= 0.90, <= 0.9999999',
+    'mistune >= 0.7.3, < 0.8',
+    'typing >= 3.5.2, < 3.6',
+    'CacheControl >= 0.11.7, < 0.12',
     'lockfile >= 0.9']
 
 install_requires.append("avro")  # TODO: remove me once cwltool is
@@ -46,7 +48,7 @@
 extras_require = {}  # TODO: to be removed when the above is added
 
 setup(name='schema-salad',
-      version='2.0',
+      version='2.1',
       description='Schema Annotations for Linked Avro Data (SALAD)',
       long_description=open(README).read(),
       author='Common workflow language working group',
