diff --git a/google_classroom/endpoints/base.py b/google_classroom/endpoints/base.py
index e86c91c..6a0c70d 100644
--- a/google_classroom/endpoints/base.py
+++ b/google_classroom/endpoints/base.py
@@ -8,6 +8,7 @@
 from sqlalchemy.schema import DropTable
 from sqlalchemy.exc import NoSuchTableError, InvalidRequestError
 from timer import elapsed
+import endpoints


 class EndPoint:
@@ -273,5 +274,52 @@ def callback(request_id, response, exception):
             )
             time.sleep(20)

-    def sync_data(self):
-        return "Data syncing is not yet available."
+    def differences_between_frames(self, df1, df2, left_on, right_on):
+        """
+        Merges two dataframes and splits the result by which frame each row came from.
+
+        Parameters:
+            df1: The first dataframe to be compared.
+            df2: The second dataframe to be compared.
+            left_on: The column in df1 used to match its rows against df2.
+            right_on: The column in df2 used to match its rows against df1.
+
+        Returns:
+            left_only: A dataframe containing data only found in df1.
+            right_only: A dataframe containing data only found in df2.
+            both: A dataframe containing data found in both df1 and df2.
+        """
+        merged = pd.merge(
+            df1, df2, left_on=left_on, right_on=right_on, how="outer", indicator=True,
+        )
+        left_only = merged[merged["_merge"] == "left_only"].reset_index(drop=True)
+        right_only = merged[merged["_merge"] == "right_only"].reset_index(drop=True)
+        both = merged[merged["_merge"] == "both"].reset_index(drop=True)
+
+        return (left_only, right_only, both)
+
+    def return_cleaned_sync_data(self):
+        """
+        Performs any cleaning needed to provide usable data for syncing.
+        Can be overridden by subclasses that need special logic.
+        """
+        return self.return_all_data().astype("str")
+
+    def sync_data(self, data=None):
+        if data is None:
+            csv_name = f"{self.classname().lower()}.csv"
+            data = pd.read_csv(f"sync_files/{csv_name}").astype("str")
+        db_df = self.return_cleaned_sync_data()
+        aliases = endpoints.CourseAliases(self.service, self.sql, self.config)
+        alias_df = aliases.return_cleaned_sync_data()
+        db_df = pd.merge(
+            db_df, alias_df, left_on="courseId", right_on="courseId", how="inner"
+        )
+        data["course_alias"] = "d:" + data["course_alias"]
+
+        (left_only, right_only, _) = self.differences_between_frames(
+            data, db_df, "course_alias", "alias"
+        )
+        to_delete = right_only[self.to_delete_columns]
+        to_create = left_only[self.to_create_columns]
+        return (to_create, to_delete)
diff --git a/google_classroom/endpoints/course.py b/google_classroom/endpoints/course.py
index b87c293..6ec3dc7 100644
--- a/google_classroom/endpoints/course.py
+++ b/google_classroom/endpoints/course.py
@@ -22,6 +22,13 @@ def __init__(self, service, sql, config):
             "updateTime",
         ]
         self.request_key = "courses"
+        self.to_delete_columns = ["courseId", "name", "section"]
+        self.to_create_columns = [
+            "course_alias",
+            "course_name",
+            "section_name",
+            "teacher_email",
+        ]
         self.batch_size = config.COURSES_BATCH_SIZE

     def request_data(self, course_id=None, date=None, next_page_token=None):
@@ -31,3 +38,10 @@ def request_data(self, course_id=None, date=None, next_page_token=None):

     def filter_data(self, dataframe):
         return dataframe[dataframe.updateTime >= self.config.SCHOOL_YEAR_START]
+
+    def return_cleaned_sync_data(self):
+        df = self.return_all_data().astype("str")
+        df = df[df["courseState"] == "ACTIVE"]
+        df = df.rename(columns={"id": "courseId"})
+        df = df[["courseId", "name", "section"]]
+        return df.astype("str")
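The comparison at the heart of the new sync_data above relies on pandas' indicator=True option for merge, which tags every row of an outer merge with "left_only", "right_only", or "both". A minimal standalone sketch of that technique follows; the frames and values are invented purely for illustration and are not taken from this repository:

import pandas as pd

# Hypothetical frames, invented only to show the indicator mechanics.
source = pd.DataFrame({"course_alias": ["d:123", "d:678"], "course_name": ["Biology", "History"]})
db = pd.DataFrame({"alias": ["d:123", "d:345"], "courseId": ["1", "3"]})

# An outer merge with indicator=True adds a "_merge" column recording each row's origin.
merged = pd.merge(source, db, left_on="course_alias", right_on="alias", how="outer", indicator=True)

to_create = merged[merged["_merge"] == "left_only"]   # rows present only in the source file
to_delete = merged[merged["_merge"] == "right_only"]  # rows present only in the database

Rows found in both frames need no action, which is why sync_data discards the third element returned by differences_between_frames.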
diff --git a/google_classroom/main.py b/google_classroom/main.py
index 47e8857..2c5ea8e 100644
--- a/google_classroom/main.py
+++ b/google_classroom/main.py
@@ -165,8 +165,8 @@ def pull_data(config, creds, sql):

 def sync_data(config, creds, sql):
     classroom_service = build("classroom", "v1", credentials=creds)
-    result = Courses(classroom_service, sql, config).sync_data()
-    print(result)
+    (to_create, to_delete) = Courses(classroom_service, sql, config).sync_data()
+    print("Data syncing is not yet available.")


 if __name__ == "__main__":
diff --git a/google_classroom/sync_files/courses_sample.csv b/google_classroom/sync_files/courses_sample.csv
index e69de29..1aeeb8f 100644
--- a/google_classroom/sync_files/courses_sample.csv
+++ b/google_classroom/sync_files/courses_sample.csv
@@ -0,0 +1,4 @@
+course_alias,course_name,section_name,teacher_email
+25601,Literature,1,teacher1@gmail.com
+25602,English,2,teacher2@gmail.com
+25603,Biology,3,teacher3@gmail.com
\ No newline at end of file
diff --git a/tests/sync_data.py b/tests/sync_data.py
new file mode 100644
index 0000000..ad6e07f
--- /dev/null
+++ b/tests/sync_data.py
@@ -0,0 +1,43 @@
+import pandas as pd
+
+# Note on the data:
+# They are designed to test a few different scenarios:
+# 1. The "normal" flows.
+# 2. Course data that is missing alias data (Physics should be ignored).
+# 3. Alias data that is missing course data (d:111 should be ignored).
+# 4. Archived courses (Paleontology should be ignored).
+
+COURSE_DATA = pd.DataFrame(
+    {
+        "id": ["1", "2", "3", "4", "5"],
+        "name": ["Biology", "Math", "English", "Paleontology", "Physics"],
+        "courseState": ["ACTIVE", "ACTIVE", "ACTIVE", "ARCHIVED", "ACTIVE"],
+        "section": ["1", "1", "2", "3", "4"],
+    }
+)
+ALIAS_DATA = pd.DataFrame(
+    {
+        "courseId": ["1", "2", "3", "4", "0"],
+        "alias": ["d:123", "d:234", "d:345", "d:456", "d:111"],
+    }
+)
+SOURCE_DATA = pd.DataFrame(
+    {
+        "course_alias": ["123", "234", "678", "789"],
+        "course_name": ["Biology", "Math", "History", "Computer Science"],
+        "section_name": ["1", "1", "2", "2"],
+        "teacher_email": ["a@b.com", "a@b.com", "a@b.com", "a@b.com"],
+    }
+)
+
+TO_CREATE_SOLUTION = pd.DataFrame(
+    {
+        "course_alias": ["d:678", "d:789"],
+        "course_name": ["History", "Computer Science"],
+        "section_name": ["2", "2"],
+        "teacher_email": ["a@b.com", "a@b.com"],
+    }
+)
+TO_DELETE_SOLUTION = pd.DataFrame(
+    {"courseId": ["3"], "name": ["English"], "section": ["2"]}
+)
diff --git a/tests/test_all.py b/tests/test_all.py
index 506ca04..6a618ed 100644
--- a/tests/test_all.py
+++ b/tests/test_all.py
@@ -35,6 +35,13 @@
     TOPIC_SOLUTION,
     MEET_SOLUTION,
 )
+from sync_data import (
+    COURSE_DATA,
+    ALIAS_DATA,
+    SOURCE_DATA,
+    TO_CREATE_SOLUTION,
+    TO_DELETE_SOLUTION,
+)


 class TestPulls:
@@ -145,6 +152,7 @@ def generic_get_test(self, endpoint, solution, course_ids=[None], dates=[None]):
             endpoint.table_name, con=self.sql.engine, schema=self.sql.schema
         )
         assert result.equals(solution)
+        endpoint._drop_table()


 class TestSync:
@@ -154,5 +162,10 @@ def setup(self):
         self.service = FakeService()

     def test_sync_courses(self):
-        results = Courses(self.service, self.sql, self.config).sync_data()
-        assert results == "Data syncing is not yet available."
+        courses = Courses(self.service, self.sql, self.config)
+        aliases = CourseAliases(self.service, self.sql, self.config)
+        self.sql.insert_into(courses.table_name, COURSE_DATA)
+        self.sql.insert_into(aliases.table_name, ALIAS_DATA)
+        (to_create, to_delete) = courses.sync_data(SOURCE_DATA)
+        assert to_create.equals(TO_CREATE_SOLUTION)
+        assert to_delete.equals(TO_DELETE_SOLUTION)
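To see why the new fixtures in tests/sync_data.py produce TO_CREATE_SOLUTION and TO_DELETE_SOLUTION, it helps to trace how sync_data assembles db_df before the comparison. The sketch below replays that preparation with the fixture values in plain pandas; it mirrors the steps in the diff above but inlines them instead of calling the project classes:

import pandas as pd

# COURSE_DATA and ALIAS_DATA from tests/sync_data.py, reproduced for the walk-through.
courses = pd.DataFrame({
    "id": ["1", "2", "3", "4", "5"],
    "name": ["Biology", "Math", "English", "Paleontology", "Physics"],
    "courseState": ["ACTIVE", "ACTIVE", "ACTIVE", "ARCHIVED", "ACTIVE"],
    "section": ["1", "1", "2", "3", "4"],
})
aliases = pd.DataFrame({
    "courseId": ["1", "2", "3", "4", "0"],
    "alias": ["d:123", "d:234", "d:345", "d:456", "d:111"],
})

# Courses.return_cleaned_sync_data: keep ACTIVE rows, rename id -> courseId, trim columns.
# Paleontology (ARCHIVED) drops out here.
db_df = courses[courses["courseState"] == "ACTIVE"].rename(columns={"id": "courseId"})
db_df = db_df[["courseId", "name", "section"]]

# sync_data then inner-joins the aliases: Physics (id 5, no alias) and d:111 (no course)
# drop out, leaving Biology/d:123, Math/d:234 and English/d:345 to compare against the CSV.
db_df = pd.merge(db_df, aliases, on="courseId", how="inner")

From there, the outer merge against SOURCE_DATA (once its course_alias values carry the "d:" prefix) marks History and Computer Science as left_only and English as right_only, which is exactly what the two solution frames encode.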