Skip to content

Commit

Permalink
adding initial cli using click, tests todo
Browse files Browse the repository at this point in the history
addressing #11
  • Loading branch information
Sparrow0hawk committed Aug 18, 2020
1 parent 480372e commit 7f87d95
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 78 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def readme():
setup(
name='topic_model_to_Shiny_app',
url='https://github.com/Sparrow0hawk/topic_model_to_Shiny_app',
version="1.3.0",
version="1.4.0-dev",
author='Alex Coleman',
author_email='a.coleman1@leeds.ac.uk',
description='An implementation of Gensim topic modelling that identifies the number of topics using coherence scores.',
Expand Down
58 changes: 0 additions & 58 deletions topic_model_to_Shiny_app/master_run.py

This file was deleted.

27 changes: 27 additions & 0 deletions topic_model_to_Shiny_app/tests/cli_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import unittest
from unittest.mock import patch
import pkg_resources
from click.testing import CliRunner
from topic2shiny import main

resource_package = 'topic_model_to_Shiny_app'

test_dir = os.path.dirname(os.path.abspath(__file__))

class CLI_test(unittest.TestCase):
    """Smoke tests for the click command-line entry point ``main``."""

    def test_CLI(self):
        """``--help`` exits cleanly and lists the declared options."""
        # TODO: extend with an end-to-end run against the bundled test data.
        runner = CliRunner()

        # --help is an eager click option: it prints usage and exits before
        # any prompt is shown or the command body runs, so this checks the
        # option wiring without starting the LDA pipeline.
        result = runner.invoke(main, ['--help'])

        self.assertEqual(result.exit_code, 0, msg=result.output)
        for option in ('--input-path', '--output-directory', '--num-runs'):
            self.assertIn(option, result.output)



if __name__ == "__main__":

unittest.main(verbosity=2)
23 changes: 12 additions & 11 deletions topic_model_to_Shiny_app/tests/complete_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@ def setUp(self):

self.model = self.class_(corpus, id2word=dictionary, num_topics=3)

@patch('builtins.input',return_value=pkg_resources.resource_filename(resource_package, 'tests/test_data/test_data.csv'))
def test_initial_data_import(self, input):
def test_initial_data_import(self):

self.data = text_preprocessing.initial_data_import()

print(self.data)
self.data = text_preprocessing.initial_data_import(data_path=pkg_resources.resource_filename(resource_package, 'tests/test_data/test_data.csv'))

self.assertTrue(isinstance(self.data, pd.DataFrame))

with self.assertRaises(SystemExit) as cm:

self.data = text_preprocessing.initial_data_import(data_path=pkg_resources.resource_filename(resource_package, 'tests/test_data/test_data2.csv'))

self.assertEqual(cm.exception.code, 0)

def test_validate_input_data(self):

self.data = text_preprocessing.validate_input_data(
Expand All @@ -59,10 +62,9 @@ def test_remove_stopwords(self):

self.assertEqual(text_preprocessing.remove_stopwords([['the','and','jackal']]), [['jackal']])

@patch('builtins.input',return_value=pkg_resources.resource_filename(resource_package,'tests/test_data/test_data.csv'))
def test_full_preprocessing(self, input):
def test_full_preprocessing(self):

self.data = text_preprocessing.preprocessing()
self.data = text_preprocessing.preprocessing(data_path=pkg_resources.resource_filename(resource_package,'tests/test_data/test_data.csv'))

self.assertTrue(isinstance(self.data, pd.DataFrame))

Expand Down Expand Up @@ -168,12 +170,11 @@ def test_get_top3(self):



@patch('builtins.input', side_effect=[pkg_resources.resource_filename(resource_package, 'tests/test_data/test_data.csv')])
def test_int(self, input):
def test_int(self):
# run preprocessing function
# should save data into right places for next function

self.data = text_preprocessing.preprocessing()
self.data = text_preprocessing.preprocessing(data_path=pkg_resources.resource_filename(resource_package, 'tests/test_data/test_data.csv'))

# run topic selector function using output from above preprocessing
self.data = topic_number_selex.topic_number_selector(self.data,
Expand Down
26 changes: 18 additions & 8 deletions topic_model_to_Shiny_app/text_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# script for preprocessing text data for LDA
# import libraries
import sys
import os
import numpy as np
import pandas as pd
import gensim
Expand All @@ -17,10 +18,10 @@


# main function that performs preprocessing
def preprocessing():
def preprocessing(data_path : str):
"""Complete preprocessing script that outputs dataframe with new tokens column."""

working_dataframe = initial_data_import()
working_dataframe = initial_data_import(data_path)

validated_dataframe = validate_input_data(working_dataframe)

Expand Down Expand Up @@ -51,18 +52,27 @@ def preprocessing():
return validated_dataframe


def initial_data_import():
"""Imports initial .csv dataframe converts into pandas object."""
def initial_data_import(data_path : str):
"""
Imports initial .csv dataframe converts into pandas object.
:param data_path: this is a string of the full path to data file
:returns: pd.DataFrame object
"""

print('Beginning text preprocessing.')
# specifying data directory

data_path = str(input('Specify the full path to the input :'))

# validate that the file format is .csv
if os.path.isfile(data_path):

if str(data_path[-4:]) != '.csv':
print('Please confirm you are passing a .csv file type.')
original_frame = pd.read_csv(data_path.strip(), encoding='latin1')

else:
print('File does not exist.')
print('Please check the directory you passed: \n',data_path)
sys.exit(0)

# reading data
original_frame = pd.read_csv(data_path.strip(), encoding='latin1')
Expand Down
74 changes: 74 additions & 0 deletions topic_model_to_Shiny_app/topic2shiny.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import time
# TODO: consider the security implications of using subprocess
import subprocess
import click
from topic_model_to_Shiny_app.text_preprocessing import preprocessing
from topic_model_to_Shiny_app.topic_number_selex import topic_number_selector
from topic_model_to_Shiny_app.dominant_topic_processing import topic_processing
import pkg_resources
import pathlib

# specify top level package folder
resource_package = 'topic_model_to_Shiny_app'

@click.command()
@click.option('-i', '--input-path', prompt='Specify input data path', type=str,
              help='The absolute path to the input data file.')
@click.option('-o', '--output-directory', prompt='Specify directory for outputs', type=str,
              help='The absolute path to a directory for outputs, will create a new directory if required.')
@click.option('-n', '--num-runs', prompt='Please specify the number of LDA repeats for topic number selection', type=int,
              help='The number of LDA repeat runs desired to establish working topic number.')
@click.option('-p', '--pretrained', default=False, type=bool,
              help='A boolean to check if user wishes to use a pretrained model.')
@click.option('-m', '--pretrained-model', type=str,
              help='The absolute path to the pretrained model file.')
@click.option('-s', '--shiny-start', default=True, type=bool,
              help='A boolean to confirm if user wishes Shiny app instance to start.')
def main(input_path: str,
         output_directory: str,
         num_runs: int,
         pretrained: bool,
         pretrained_model: str,
         shiny_start: bool):
    """
    Run the topic modelling pipeline and optionally start the Shiny app.

    \b
    :param input_path: a string of the path to the input data file
    :param output_directory: a string of the path to the directory for outputs
    :param num_runs: an integer of the number of repeat runs of LDA to perform
    :param pretrained: a boolean of whether user wants to use a pretrained model
    :param pretrained_model: a string of the path to the pretrained model
    :param shiny_start: a boolean of whether to start shiny app at the end of processing
    :returns: None (loads shiny app)
    """
    # The docstring above is what click prints for --help. The backspace
    # marker must open its own paragraph (blank line before it), otherwise
    # click rewraps the :param lines into a single block of text.

    # take the output directory and create it if it doesn't exist
    pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)

    # a fresh model is only built when no pretrained model was requested
    if not pretrained:
        transformed_data = preprocessing(input_path)

        topic_number_selector(processed_data=transformed_data,
                              output_path=output_directory,
                              narrow_iter=num_runs,
                              wide_iter=100)
    else:
        # TODO: write in how to load a pretrained model
        # (pretrained_model is accepted but not used yet)
        pass

    topic_processing(output_path=output_directory)

    if shiny_start:
        print('Initialising Shiny dashboard.')

        shiny_dir = pkg_resources.resource_filename(resource_package, "Shiny/combined.R")

        # process to call shiny app
        # NOTE(review): the .R file is executed directly, which presumably
        # relies on a shebang line and the executable bit - confirm.
        subprocess.call([shiny_dir])

if __name__ == '__main__':

main()

0 comments on commit 7f87d95

Please sign in to comment.