Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

to s3 #2

Merged
merged 8 commits into from
Jul 10, 2017
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions datapackage_pipelines_aws/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +0,0 @@
# -*- coding: utf-8 -*-

import io
import os

from .generator import Generator

# Absolute path of the VERSION file shipped alongside this package.
VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION')

# Package version is the first line of the VERSION file.
# io.open is used (rather than the builtin) for Python 2 compatibility,
# so the encoding can be stated explicitly on both interpreters.
__version__ = io.open(VERSION_FILE, encoding='utf-8').readline().strip()

# Public API of the package.
__all__ = ['Generator']
13 changes: 13 additions & 0 deletions datapackage_pipelines_aws/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os
import logging


def generate_path(file_path, base_path='', datapackage=None):
    """Build an object key by joining *file_path* onto a formatted *base_path*.

    ``base_path`` may contain ``str.format`` placeholders (e.g. ``{owner}``,
    ``{name}``, ``{version}``) that are filled from *datapackage*;
    ``version`` defaults to ``'latest'`` when the datapackage does not
    provide one.

    :param file_path: relative file path, appended unchanged.
    :param base_path: optional prefix, possibly containing placeholders.
    :param datapackage: mapping supplying placeholder values (default: empty).
    :returns: the joined path.
    :raises KeyError: if a placeholder in *base_path* has no value in
        *datapackage* (other than ``version``).
    """
    # Avoid the shared-mutable-default-argument pitfall: the original
    # signature used ``datapackage={}``.
    if datapackage is None:
        datapackage = {}
    format_params = {'version': 'latest'}
    format_params.update(datapackage)
    try:
        base_path = base_path.format(**format_params)
    except KeyError as missing:
        # Log the actual missing key. The original formatted the KeyError
        # *class* ('%s' % KeyError), which printed "<class 'KeyError'>"
        # instead of the unresolved property name.
        logging.error('datapackage.json is missing property: %s', missing)
        raise
    return os.path.join(base_path, file_path)
25 changes: 25 additions & 0 deletions datapackage_pipelines_aws/processors/to_s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import boto3

from datapackage_pipelines.lib.dump.dumper_base import CSVDumper
from datapackage_pipelines_aws import helpers


class S3Dumper(CSVDumper):
    """Dumper that uploads each generated file to an AWS S3 bucket.

    Spec parameters:
      ``bucket`` (required) -- target S3 bucket name.
      ``path``   (optional) -- key prefix; may contain format placeholders
        (e.g. ``{owner}``, ``{name}``, ``{version}``) resolved against the
        datapackage by ``helpers.generate_path``.

    AWS credentials are resolved by boto3 itself (environment variables,
    AWS config files, instance roles, ...); they are not part of the spec.
    """

    def initialize(self, params):
        super(S3Dumper, self).initialize(params)
        self.bucket = params['bucket']
        self.client = boto3.client('s3')
        self.base_path = params.get('path', '')

    def prepare_datapackage(self, datapackage, _):
        # Keep the datapackage around so placeholders in the base path can
        # be resolved when individual files are written.
        self.datapackage = datapackage
        return datapackage

    def write_file_to_output(self, filename, path):
        key = helpers.generate_path(path, self.base_path, self.datapackage)
        # Use a context manager so the local file handle is closed after
        # the upload; the original opened the file inline in the
        # put_object() call and leaked the handle.
        with open(filename, 'rb') as body:
            self.client.put_object(Body=body, Bucket=self.bucket, Key=key)


S3Dumper()()
28 changes: 28 additions & 0 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import unittest

from datapackage_pipelines_aws import helpers

class TestToS3Proccessor(unittest.TestCase):
    """Unit tests for ``helpers.generate_path``."""

    def test_generate_path(self):
        # Plain join: no placeholders in the base path.
        inpath = 'datapackage.json'
        basepath = 'my/test/path'
        expected = 'my/test/path/datapackage.json'
        datapackage = {'name': 'my-package'}
        out = helpers.generate_path(inpath, basepath, datapackage)
        # assertEqual, not the deprecated assertEquals alias
        # (removed in Python 3.12).
        self.assertEqual(out, expected)

    def test_generate_path_with_formatted_string(self):
        # Placeholders are filled from the datapackage; ``version``
        # falls back to 'latest'.
        inpath = 'datapackage.json'
        basepath = 'my/test/path/{owner}/{name}/{version}'
        expected = 'my/test/path/me/my-package/latest/datapackage.json'
        datapackage = {'name': 'my-package', 'owner': 'me'}
        out = helpers.generate_path(inpath, basepath, datapackage)
        self.assertEqual(out, expected)

    def test_generate_path_errors_without_owner_in_datapackage(self):
        # A placeholder with no matching datapackage property must raise
        # (and propagate) KeyError.
        inpath = 'datapackage.json'
        basepath = 'my/test/path/{owner}/{name}/{version}'
        datapackage = {'name': 'my-package'}
        with self.assertRaises(KeyError):
            helpers.generate_path(inpath, basepath, datapackage)
75 changes: 75 additions & 0 deletions tests/test_to_s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import json
import os
import unittest

from moto import mock_s3
import boto3

from datapackage_pipelines.utilities.lib_test_helpers import (
mock_processor_test
)

import datapackage_pipelines_aws.processors

import logging
log = logging.getLogger(__name__)


class TestToS3Proccessor(unittest.TestCase):
    """End-to-end test of the ``to_s3`` processor against a mocked S3."""

    def setUp(self):
        self.bucket = 'my.test.bucket'
        self.resources = [{
            'name': 'resource',
            'format': 'csv',
            'path': 'data/test.csv',
            'schema': {
                'fields': [
                    {
                        'name': 'Date',
                        'type': 'date',
                    },
                    {
                        'name': 'Name',
                        'type': 'string',
                    }
                ]
            }
        }]
        self.datapackage = {
            'owner': 'me',
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': self.resources
        }
        self.params = {
            'bucket': self.bucket,
            'path': 'my/test/path/{owner}/{name}/{version}'
        }
        # Path to the processor we want to test.
        self.processor_dir = \
            os.path.dirname(datapackage_pipelines_aws.processors.__file__)
        self.processor_path = os.path.join(self.processor_dir, 'to_s3.py')

    @mock_s3
    def test_puts_datapackage_on_s3(self):
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket=self.bucket)

        # Run the processor with an empty resource iterator; it should
        # still upload the datapackage descriptor itself.
        mock_processor_test(self.processor_path,
                            (self.params,
                             self.datapackage,
                             iter([])))

        keys = []
        for bucket in s3.buckets.all():
            for key in bucket.objects.all():
                keys.append(key.key)

        # Exactly one object was uploaded: datapackage.json under the
        # formatted prefix ({owner}/{name}/{version} resolved).
        # assertEqual, not the deprecated assertEquals alias
        # (removed in Python 3.12).
        self.assertEqual(len(keys), 1)
        res_path = 'my/test/path/me/my-datapackage/latest/datapackage.json'
        self.assertEqual(res_path, keys[0])

        # The stored object round-trips back to the original datapackage.
        content = s3.Object(self.bucket, res_path).get()['Body']\
            .read().decode("utf-8")
        self.assertDictEqual(json.loads(content), self.datapackage)
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ envlist=

[testenv]
deps=
mock
requests-mock
google-compute-engine
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

google-compute-engine?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

boto was complaining on Travis

Traceback:
tests/test_to_s3.py:5: in <module>
    from moto import mock_s3
.tox/py/lib/python3.6/site-packages/moto/__init__.py:9: in <module>
    from .autoscaling import mock_autoscaling, mock_autoscaling_deprecated  # flake8: noqa
.tox/py/lib/python3.6/site-packages/moto/autoscaling/__init__.py:2: in <module>
    from .models import autoscaling_backends
.tox/py/lib/python3.6/site-packages/moto/autoscaling/models.py:2: in <module>
    from boto.ec2.blockdevicemapping import BlockDeviceType, BlockDeviceMapping
.tox/py/lib/python3.6/site-packages/boto/__init__.py:1216: in <module>
    boto.plugin.load_plugins(config)
.tox/py/lib/python3.6/site-packages/boto/plugin.py:93: in load_plugins
    _import_module(file)
.tox/py/lib/python3.6/site-packages/boto/plugin.py:75: in _import_module
    return imp.load_module(name, file, filename, data)
.tox/py/lib/python3.6/imp.py:234: in load_module
    return load_source(name, filename, file)
.tox/py/lib/python3.6/imp.py:172: in load_source
    module = _load(spec)
/usr/lib/python2.7/dist-packages/google_compute_engine/boto/compute_auth.py:19: in <module>
    from google_compute_engine import logger
E   ModuleNotFoundError: No module named 'google_compute_engine'

This answer summarises the reason GoogleCloudPlatform/compute-image-packages#262 (comment)

moto
pytest
pytest-cov
coverage
Expand Down