Skip to content
This repository has been archived by the owner on Sep 1, 2022. It is now read-only.

Commit

Permalink
add diffing logic
Browse files Browse the repository at this point in the history
  • Loading branch information
zkagin committed Aug 26, 2020
1 parent 8d749aa commit 8e44a79
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 6 deletions.
52 changes: 50 additions & 2 deletions google_classroom/endpoints/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from sqlalchemy.schema import DropTable
from sqlalchemy.exc import NoSuchTableError, InvalidRequestError
from timer import elapsed
import endpoints


class EndPoint:
Expand Down Expand Up @@ -273,5 +274,52 @@ def callback(request_id, response, exception):
)
time.sleep(20)

def sync_data(self):
return "Data syncing is not yet available."
def differences_between_frames(self, df1, df2, left_on, right_on):
"""
Merges two dataframes and splits them by which one a row comes from.
Parameters:
df1: The first dataframe to be compared.
df2: The second dataframe to be compared.
left_on: The column in df1 to match the first dataframe to the second.
right_on: The column in df2 to match the second dataframe to the first.
Returns:
left_only: A dataframe containing data only found in df1.
right_only: A dataframe containing data only found in df2.
both: A dataframe containing data found in both df1 and df2.
"""
merged = pd.merge(
df1, df2, left_on=left_on, right_on=right_on, how="outer", indicator=True,
)
left_only = merged[merged["_merge"] == "left_only"].reset_index(drop=True)
right_only = merged[merged["_merge"] == "right_only"].reset_index(drop=True)
both = merged[merged["_merge"] == "both"].reset_index(drop=True)

return (left_only, right_only, both)

def return_cleaned_sync_data(self):
"""
Any cleaning that must be done to provide usable data for syncing.
Can be overwritten by subclasses in case of special logic.
"""
return self.return_all_data().astype("str")

def sync_data(self, data=None):
if data is None:
csv_name = f"{self.classname().lower()}.csv"
data = pd.read_csv(f"sync_files/{csv_name}").astype("str")
db_df = self.return_cleaned_sync_data()
aliases = endpoints.CourseAliases(self.service, self.sql, self.config)
alias_df = aliases.return_cleaned_sync_data()
db_df = pd.merge(
db_df, alias_df, left_on="courseId", right_on="courseId", how="inner"
)
data["course_alias"] = "d:" + data["course_alias"]

(left_only, right_only, _) = self.differences_between_frames(
data, db_df, "course_alias", "alias"
)
to_delete = right_only[self.to_delete_columns]
to_create = left_only[self.to_create_columns]
return (to_create, to_delete)
14 changes: 14 additions & 0 deletions google_classroom/endpoints/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def __init__(self, service, sql, config):
"updateTime",
]
self.request_key = "courses"
self.to_delete_columns = ["courseId", "name", "section"]
self.to_create_columns = [
"course_alias",
"course_name",
"section_name",
"teacher_email",
]
self.batch_size = config.COURSES_BATCH_SIZE

def request_data(self, course_id=None, date=None, next_page_token=None):
Expand All @@ -31,3 +38,10 @@ def request_data(self, course_id=None, date=None, next_page_token=None):

def filter_data(self, dataframe):
return dataframe[dataframe.updateTime >= self.config.SCHOOL_YEAR_START]

def return_cleaned_sync_data(self):
df = self.return_all_data().astype("str")
df = df[df["courseState"] == "ACTIVE"]
df = df.rename(columns={"id": "courseId"})
df = df[["courseId", "name", "section"]]
return df.astype("str")
4 changes: 2 additions & 2 deletions google_classroom/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ def pull_data(config, creds, sql):

def sync_data(config, creds, sql):
classroom_service = build("classroom", "v1", credentials=creds)
result = Courses(classroom_service, sql, config).sync_data()
print(result)
(to_create, to_delete) = Courses(classroom_service, sql, config).sync_data()
print("Data syncing is not yet available.")


if __name__ == "__main__":
Expand Down
4 changes: 4 additions & 0 deletions google_classroom/sync_files/courses_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
course_alias,course_name,section_name,teacher_email
25601,Literature,1,teacher1@gmail.com
25602,English,2,teacher2@gmail.com
25603,Biology,3,teacher3@gmail.com
43 changes: 43 additions & 0 deletions tests/sync_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd

# Note on the data:
# They are designed to test a few different scenarios:
# 1. The "normal" flows.
# 2. Course data that is missing alias data (Physics should be ignored).
# 3. Alias data that is missing course data (d:111 should be ignored).
# 4. Archived courses (Paleontology should be ignored).

COURSE_DATA = pd.DataFrame(
{
"id": ["1", "2", "3", "4", "5"],
"name": ["Biology", "Math", "English", "Paleontology", "Physics"],
"courseState": ["ACTIVE", "ACTIVE", "ACTIVE", "ARCHIVED", "ACTIVE"],
"section": ["1", "1", "2", "3", "4"],
}
)
ALIAS_DATA = pd.DataFrame(
{
"courseId": ["1", "2", "3", "4", "0"],
"alias": ["d:123", "d:234", "d:345", "d:456", "d:111"],
}
)
SOURCE_DATA = pd.DataFrame(
{
"course_alias": ["123", "234", "678", "789"],
"course_name": ["Biology", "Math", "History", "Computer Science"],
"section_name": ["1", "1", "2", "2"],
"teacher_email": ["a@b.com", "a@b.com", "a@b.com", "a@b.com"],
}
)

TO_CREATE_SOLUTION = pd.DataFrame(
{
"course_alias": ["d:678", "d:789"],
"course_name": ["History", "Computer Science"],
"section_name": ["2", "2"],
"teacher_email": ["a@b.com", "a@b.com"],
}
)
TO_DELETE_SOLUTION = pd.DataFrame(
{"courseId": ["3"], "name": ["English"], "section": ["2"]}
)
17 changes: 15 additions & 2 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@
TOPIC_SOLUTION,
MEET_SOLUTION,
)
from sync_data import (
COURSE_DATA,
ALIAS_DATA,
SOURCE_DATA,
TO_CREATE_SOLUTION,
TO_DELETE_SOLUTION,
)


class TestPulls:
Expand Down Expand Up @@ -145,6 +152,7 @@ def generic_get_test(self, endpoint, solution, course_ids=[None], dates=[None]):
endpoint.table_name, con=self.sql.engine, schema=self.sql.schema
)
assert result.equals(solution)
endpoint._drop_table()


class TestSync:
Expand All @@ -154,5 +162,10 @@ def setup(self):
self.service = FakeService()

def test_sync_courses(self):
results = Courses(self.service, self.sql, self.config).sync_data()
assert results == "Data syncing is not yet available."
courses = Courses(self.service, self.sql, self.config)
aliases = CourseAliases(self.service, self.sql, self.config)
self.sql.insert_into(courses.table_name, COURSE_DATA)
self.sql.insert_into(aliases.table_name, ALIAS_DATA)
(to_create, to_delete) = courses.sync_data(SOURCE_DATA)
assert to_create.equals(TO_CREATE_SOLUTION)
assert to_delete.equals(TO_DELETE_SOLUTION)

0 comments on commit 8e44a79

Please sign in to comment.