Skip to content
This repository has been archived by the owner on Sep 1, 2022. It is now read-only.

Commit

Permalink
use pandas instead of sql to deduplicate
Browse files (browse the repository at this point in the history)
  • Loading branch information
zkagin committed Oct 14, 2020
1 parent e05d524 commit 4342097
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 26 deletions.
14 changes: 9 additions & 5 deletions google_classroom/endpoints/submission.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
import logging
from endpoints.base import EndPoint
from sqlalchemy import text


class StudentSubmissions(EndPoint):
@@ -114,7 +114,11 @@ def preprocess_records(self, records):
return new_records

def perform_cleanup(self):
with open("sql/remove_duplicates.sql") as sql_file:
escaped_sql = text(sql_file.read())
with self.sql.engine.connect().execution_options(autocommit=True) as conn:
conn.execute(escaped_sql)
logging.info(f"{self.classname()}: Removing duplicates.")
all_data = self.return_all_data()
cleaned_data = all_data.drop_duplicates(subset=["id"], keep="last")
log = f"Condensed database from {len(all_data)} to {len(cleaned_data)} rows."
logging.debug(f"{self.classname()}: " + log)
self.sql.insert_into(
self.table_name, cleaned_data, if_exists="replace", chunksize=10000
)
4 changes: 1 addition & 3 deletions google_classroom/main.py
Original file line number | Diff line number | Diff line change
@@ -153,9 +153,7 @@ def pull_data(config, creds, sql):

# Get student coursework submissions
if config.PULL_SUBMISSIONS:
StudentSubmissions(classroom_service, sql, config).batch_pull_data(
course_ids, overwrite=False
)
StudentSubmissions(classroom_service, sql, config).batch_pull_data(course_ids)

# Get Meet data
if config.PULL_MEET:
Expand Down
18 changes: 0 additions & 18 deletions google_classroom/sql/remove_duplicates.sql

This file was deleted.

0 comments on commit 4342097

Please sign in to comment.