Skip to content
This repository has been archived by the owner on Sep 1, 2022. It is now read-only.

Check differences between source data and the database #98

Merged
merged 2 commits into from
Sep 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions google_classroom/endpoints/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from sqlalchemy.schema import DropTable
from sqlalchemy.exc import NoSuchTableError, InvalidRequestError
from timer import elapsed
import endpoints


class EndPoint:
Expand Down Expand Up @@ -273,5 +274,57 @@ def callback(request_id, response, exception):
)
time.sleep(20)

def sync_data(self):
return "Data syncing is not yet available."
def differences_between_frames(self, df1, df2, left_on, right_on):
"""
Merges two dataframes and splits them by which one a row comes from.

Parameters:
df1: The first dataframe to be compared.
df2: The second dataframe to be compared.
left_on: The column in df1 to match the first dataframe to the second.
right_on: The column in df2 to match the second dataframe to the first.

Returns:
left_only: A dataframe containing data only found in df1.
right_only: A dataframe containing data only found in df2.
both: A dataframe containing data found in both df1 and df2.
"""
merged = pd.merge(
df1,
df2,
left_on=left_on,
right_on=right_on,
how="outer",
indicator=True,
)
left_only = merged[merged["_merge"] == "left_only"].reset_index(drop=True)
right_only = merged[merged["_merge"] == "right_only"].reset_index(drop=True)
both = merged[merged["_merge"] == "both"].reset_index(drop=True)

return (left_only, right_only, both)

def return_cleaned_sync_data(self):
"""
Any cleaning that must be done to provide usable data for syncing.
Can be overwritten by subclasses in case of special logic.
"""
return self.return_all_data().astype("str")

def sync_data(self, data=None):
if data is None:
csv_name = f"{self.classname().lower()}.csv"
data = pd.read_csv(f"sync_files/{csv_name}").astype("str")
db_df = self.return_cleaned_sync_data()
aliases = endpoints.CourseAliases(self.service, self.sql, self.config)
alias_df = aliases.return_cleaned_sync_data()
db_df = pd.merge(
db_df, alias_df, left_on="courseId", right_on="courseId", how="inner"
)
data["alias"] = "d:" + data["alias"]

(left_only, right_only, _) = self.differences_between_frames(
data, db_df, "alias", "alias"
)
to_create = data[data.alias.isin(left_only.alias)].reset_index(drop=True)
to_delete = db_df[db_df.alias.isin(right_only.alias)].reset_index(drop=True)
return (to_create, to_delete)
7 changes: 7 additions & 0 deletions google_classroom/endpoints/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ def request_data(self, course_id=None, date=None, next_page_token=None):

def filter_data(self, dataframe):
return dataframe[dataframe.updateTime >= self.config.SCHOOL_YEAR_START]

def return_cleaned_sync_data(self):
df = self.return_all_data().astype("str")
df = df[df["courseState"] == "ACTIVE"]
df = df.rename(columns={"id": "courseId"})
df = df[["courseId", "name", "section"]]
return df.astype("str")
8 changes: 4 additions & 4 deletions google_classroom/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def main(config):
sql = db_generator(config)
pull_data(config, creds, sql)
if config.SYNC:
sync_data(config, creds, sql)
sync_all_data(config, creds, sql)


def pull_data(config, creds, sql):
Expand Down Expand Up @@ -160,10 +160,10 @@ def pull_data(config, creds, sql):
Meet(admin_reports_service, sql, config).batch_pull_data()


def sync_data(config, creds, sql):
def sync_all_data(config, creds, sql):
classroom_service = build("classroom", "v1", credentials=creds)
result = Courses(classroom_service, sql, config).sync_data()
print(result)
(to_create, to_delete) = Courses(classroom_service, sql, config).sync_data()
print("Data syncing is not yet available.")


if __name__ == "__main__":
Expand Down
4 changes: 4 additions & 0 deletions google_classroom/sync_files/courses_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
alias,name,section,teacher_email
25601,Literature,1,teacher1@gmail.com
25602,English,2,teacher2@gmail.com
25603,Biology,3,teacher3@gmail.com
43 changes: 43 additions & 0 deletions tests/sync_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd

# Note on the data:
# They are designed to test a few different scenarios:
# 1. The "normal" flows.
# 2. Course data that is missing alias data (Physics should be ignored).
# 3. Alias data that is missing course data (d:111 should be ignored).
# 4. Archived courses (Paleontology should be ignored).

COURSE_DATA = pd.DataFrame(
{
"id": ["1", "2", "3", "4", "5"],
"name": ["Biology", "Math", "English", "Paleontology", "Physics"],
"courseState": ["ACTIVE", "ACTIVE", "ACTIVE", "ARCHIVED", "ACTIVE"],
"section": ["1", "1", "2", "3", "4"],
}
)
ALIAS_DATA = pd.DataFrame(
{
"courseId": ["1", "2", "3", "4", "0"],
"alias": ["d:123", "d:234", "d:345", "d:456", "d:111"],
}
)
SOURCE_DATA = pd.DataFrame(
{
"alias": ["123", "234", "678", "789"],
"name": ["Biology", "Math", "History", "Computer Science"],
"section": ["1", "1", "2", "2"],
"teacher_email": ["a@b.com", "a@b.com", "a@b.com", "a@b.com"],
}
)

TO_CREATE_SOLUTION = pd.DataFrame(
{
"alias": ["d:678", "d:789"],
"name": ["History", "Computer Science"],
"section": ["2", "2"],
"teacher_email": ["a@b.com", "a@b.com"],
}
)
TO_DELETE_SOLUTION = pd.DataFrame(
{"courseId": ["3"], "name": ["English"], "section": ["2"], "alias": ["d:345"]}
)
17 changes: 15 additions & 2 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@
TOPIC_SOLUTION,
MEET_SOLUTION,
)
from sync_data import (
COURSE_DATA,
ALIAS_DATA,
SOURCE_DATA,
TO_CREATE_SOLUTION,
TO_DELETE_SOLUTION,
)


class TestPulls:
Expand Down Expand Up @@ -145,6 +152,7 @@ def generic_get_test(self, endpoint, solution, course_ids=[None], dates=[None]):
endpoint.table_name, con=self.sql.engine, schema=self.sql.schema
)
assert result.equals(solution)
endpoint._drop_table()


class TestSync:
Expand All @@ -154,5 +162,10 @@ def setup(self):
self.service = FakeService()

def test_sync_courses(self):
results = Courses(self.service, self.sql, self.config).sync_data()
assert results == "Data syncing is not yet available."
courses = Courses(self.service, self.sql, self.config)
aliases = CourseAliases(self.service, self.sql, self.config)
self.sql.insert_into(courses.table_name, COURSE_DATA)
self.sql.insert_into(aliases.table_name, ALIAS_DATA)
(to_create, to_delete) = courses.sync_data(SOURCE_DATA)
assert to_create.equals(TO_CREATE_SOLUTION)
assert to_delete.equals(TO_DELETE_SOLUTION)