From 71e33a3abe4cf18e6a49b8c7d212958b346ba395 Mon Sep 17 00:00:00 2001
From: iris <84595986+iris-garden@users.noreply.github.com>
Date: Tue, 20 Aug 2024 13:33:09 -0400
Subject: [PATCH] wip

---
 discourse-export-script/.gitignore            |   3 +
 discourse-export-script/discourse_download.py | 251 ++++++++++++++++++
 discourse-export-script/github_upload.py      | 231 ++++++++++++++++
 .../org.mockito.plugins.MockMaker             |   2 +-
 4 files changed, 486 insertions(+), 1 deletion(-)
 create mode 100644 discourse-export-script/.gitignore
 create mode 100644 discourse-export-script/discourse_download.py
 create mode 100644 discourse-export-script/github_upload.py

diff --git a/discourse-export-script/.gitignore b/discourse-export-script/.gitignore
new file mode 100644
index 00000000000..6b7d7596f4a
--- /dev/null
+++ b/discourse-export-script/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+discourse-export/
+uploaded/
diff --git a/discourse-export-script/discourse_download.py b/discourse-export-script/discourse_download.py
new file mode 100644
index 00000000000..567eec6c5f8
--- /dev/null
+++ b/discourse-export-script/discourse_download.py
@@ -0,0 +1,251 @@
+from argparse import ArgumentParser
+from asyncio import gather, run, sleep
+from dataclasses import dataclass
+from datetime import datetime
+from html.parser import HTMLParser
+from json import dump, loads
+from re import sub
+from typing import Any, Callable, Dict, List, Tuple, TypeVar
+
+from aiohttp import ClientSession
+
+strptime = datetime.strptime
+
+# constants
+# sentinel UUIDs: POST_LINK_ID marks links between forum topics so the upload
+# script can rewrite them later, and COMMENT_END_ID separates a topic's first
+# post from its comments
+POST_LINK_ID = "f4706281-cc60-4ff0-a0b6-b803683cc24b"
+COMMENT_END_ID = "917f6034-2117-4a8c-bb42-b27fd7fb5e83"
+
+# types
+CallbackResponse = TypeVar("CallbackResponse")
+Callback = Callable[[Dict[str, Any]], CallbackResponse]
+
+
+@dataclass(frozen=True)
+class DiscoursePost:
+    id: int
+    topic_id: int
+    username: str
+    created_at: str
+    html: str
+
+
+@dataclass(frozen=True)
+class DiscourseTopic:
+    id: int
+    slug: str
+    title: str
+    html: str
+
+
+# html parser: translates discourse's "cooked" post html for github. mention
+# links are flattened to bare usernames, relative attachment links are dropped,
+# forum topic links are tagged with POST_LINK_ID, aside link previews and
+# quotes are reduced to a plain link or their header text, and pre/code blocks
+# become fenced markdown code blocks.
+class DiscourseHTMLParser(HTMLParser):
+    def __init__(self: "DiscourseHTMLParser") -> None:
+        super().__init__()
+        self.output_html = ""
+        # relative file links starting with /
+        self.relative_link = False
+        # link previews and quotes
+        self.aside = False
+        self.aside_src = None
+        self.aside_src_written = False
+        self.aside_header = False
+        self.aside_header_link = False
+        self.aside_header_link_written = False
+        # code blocks
+        self.code_block_pre = False
+        self.code_block_code = False
+        # @ mentions
+        self.mention = False
+
+    def _decl_handler(self: "DiscourseHTMLParser", decl: str) -> None:
+        self.output_html += f"<!{decl}>"
+
+    def _ref_handler(self: "DiscourseHTMLParser", name: str) -> None:
+        self.output_html += f"&{name};"
+
+    def _write_starttag(self: "DiscourseHTMLParser", attrs: List[Tuple[str, str]], tag: str, suffix: str) -> None:
+        attr_str_prefix = " " if len(attrs) > 0 else ""
+        attr_str = " ".join([f'{key}="{value}"' for key, value in attrs])
+        self.output_html += f"<{tag}{attr_str_prefix}{attr_str}{suffix}>"
+
+    # factory invoked at class-definition time to build the start-tag handlers
+    def _starttag_handler(suffix: str = "") -> Callable:
+        def inner(self: "DiscourseHTMLParser", tag: str, attrs: List[Tuple[str, str]]) -> None:
+            attr_dict = dict(attrs)
+            if ((not self.aside) or self.aside_header) and tag == "a":
+                if self.aside_header and not self.aside_header_link_written:
+                    self.aside_header_link = True
+                link = attr_dict.get("href", "")
+                if "mention" in attr_dict.get("class", ""):
+                    self.mention = True
+                elif link.startswith("/"):
+                    self.relative_link = True
+                elif "https://discuss.hail.is/t/" in link:
+                    # tag forum-internal links with the sentinel so the upload
+                    # script can point them at the right github issue
+                    slug = link.removeprefix("https://discuss.hail.is/t/").split("/")[0]
+                    self.output_html += f'<a href="{POST_LINK_ID}/{slug}">'
+                else:
+                    self._write_starttag(attrs, tag, suffix)
+            elif (
+                self.aside
+                and self.aside_src is None
+                and (tag == "header" or (tag == "div" and "title" in attr_dict.get("class", "")))
+            ):
+                self.aside_header = True
+                self.output_html += "\n"
+            elif self.aside_header and tag == "blockquote":
+                self.aside = False
+                self.aside_header = False
+                self.aside_header_link_written = False
+                self._write_starttag(attrs, tag, suffix)
+            elif self.aside_header and tag == "article":
+                self.aside_header = False
+                self.aside_header_link_written = False
+            elif not self.aside:
+                if tag == "aside":
+                    self.aside = True
+                    onebox_src = attr_dict.get("data-onebox-src", None)
+                    if onebox_src is not None:
+                        self.aside_src = onebox_src
+                        self.output_html += f'\n<a href="{onebox_src}">'
+                elif tag == "pre":
+                    self.code_block_pre = True
+                elif self.code_block_pre:
+                    if tag == "code":
+                        self.output_html += "\n\n```python\n"
+                        self.code_block_code = True
+                else:
+                    self._write_starttag(attrs, tag, suffix)
+
+        return inner
+
+    handle_charref = _ref_handler
+    handle_decl = _decl_handler
+    handle_entityref = _ref_handler
+    handle_startendtag = _starttag_handler(" /")
+    handle_starttag = _starttag_handler()
+    unknown_decl = _decl_handler
+
+    def handle_comment(self: "DiscourseHTMLParser", data: str) -> None:
+        self.output_html += f"<!--{data}-->"
+
+    def handle_data(self: "DiscourseHTMLParser", data: str) -> None:
+        if "https://discuss.hail.is/t/" in data:
+            data = sub('https://discuss.hail.is/t/([A-Za-z0-9\\-]*)', f'{POST_LINK_ID}/\\1', data)
+        if self.mention:
+            # flatten @ mentions to the bare username
+            self.output_html += data.partition("@")[2]
+        elif self.aside_src is not None and not self.aside_src_written:
+            self.output_html += self.aside_src
+            self.aside_src_written = True
+        elif (not self.aside) or self.aside_header_link:
+            self.output_html += data
+
+    def handle_endtag(self: "DiscourseHTMLParser", tag: str) -> None:
+        if ((not self.aside) or self.aside_header) and tag == "a":
+            if self.mention:
+                self.mention = False
+            elif self.relative_link:
+                self.relative_link = False
+            else:
+                if self.aside_header_link:
+                    self.aside_header_link = False
+                    self.aside_header_link_written = True
+                self.output_html += "</a>"
+        elif tag == "aside":
+            self.aside = False
+            if self.aside_src is not None:
+                self.output_html += "</a>\n"
+            self.aside_src = None
+            self.aside_src_written = False
+            self.aside_header_link_written = True
+        elif not self.aside:
+            if tag == "pre":
+                self.code_block_pre = False
+            elif self.code_block_pre:
+                if tag == "code":
+                    self.output_html += "\n```\n\n"
+                    self.code_block_code = False
+            else:
+                self.output_html += f"</{tag}>"
+
+    def handle_pi(self: "DiscourseHTMLParser", data: str) -> None:
+        self.output_html += f"<?{data}>"
+
+
+# main script
+async def main(discourse_page: int) -> None:
+    async with ClientSession() as session:
+        # fetch the topic list, then every topic, then every post
+        pages = await run_tasks(
+            [parse_page(discourse_page, session)]
+            # [parse_page(page, session) for page in range(discourse_page + 1)]
+        )
+        topics = await run_tasks([
+            parse_topic(topic["id"], session) for page in pages for topic in page["topic_list"]["topics"]
+        ])
+        posts = await run_tasks([
+            parse_post(post["id"], session) for topic in topics for post in topic["post_stream"]["posts"]
+        ])
+
+        # group posts under their parent topics
+        topic_acc = {topic["id"]: {"fields": topic, "posts": []} for topic in topics}
+        for post in posts:
+            topic_acc[post.topic_id]["posts"].append(post)
+
+        for topic in topic_acc.values():
+            if topic["fields"]["slug"] != "welcome-to-the-hail-community":
+                topic_html = ""
+                for idx, post in enumerate(topic["posts"]):
+                    parser = DiscourseHTMLParser()
+                    parser.feed(post.html)
+                    topic_html += f"> [!NOTE]\n> The following post was exported from discuss.hail.is, a forum for asking questions about Hail which has since been deprecated.\n\n## ({strptime(post.created_at, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%b %d, %Y at %H:%M')}) {post.username} said:\n{parser.output_html} {COMMENT_END_ID if idx < (len(topic['posts']) - 1) else ''}"
+                # zero-pad the id so lexicographic filename order matches topic order
+                with open(f'./discourse-export/{topic["fields"]["id"]:04}_{topic["fields"]["slug"]}.json', 'w') as file:
+                    dump(
+                        {
+                            "id": topic["fields"]["id"],
+                            "slug": topic["fields"]["slug"],
+                            "title": topic["fields"]["title"],
+                            "html": topic_html,
+                        },
+                        file,
+                    )
+
+
+async def run_tasks(tasks: List[Any]) -> List[Any]:
+    # throttle: run at most four requests at a time, sleeping between batches
+    result = []
+    while len(tasks) != 0:
+        if len(tasks) > 4:
+            result += await gather(*tasks[:4])
+            tasks = tasks[4:]
+            await sleep(2)
+        else:
+            result += await gather(*tasks)
+            tasks = []
+    return result
+
+
+async def parse_page(discourse_page: int, session: ClientSession) -> Dict[str, Any]:
+    async with session.get(f"https://discuss.hail.is/latest.json?page={discourse_page}") as response:
+        return await response.json()
+
+
+async def parse_topic(topic_id: int, session: ClientSession) -> Dict[str, Any]:
+    async with session.get(f"https://discuss.hail.is/t/{topic_id}.json") as response:
+        return loads(await response.read())
+
+
+async def parse_post(post_id: int, session: ClientSession) -> DiscoursePost:
+    async with session.get(f"https://discuss.hail.is/posts/{post_id}.json") as response:
+        response_json = loads(await response.read())
+        return DiscoursePost(
+            response_json["id"],
+            response_json["topic_id"],
+            response_json["username"],
+            response_json["created_at"],
+            response_json["cooked"],
+        )
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--page")
+    args = parser.parse_args()
+    run(main(int(args.page)))
diff --git a/discourse-export-script/github_upload.py b/discourse-export-script/github_upload.py
new file mode 100644
index 00000000000..b11660fa338
--- /dev/null
+++ b/discourse-export-script/github_upload.py
@@ -0,0 +1,231 @@
+from argparse import ArgumentParser
+from asyncio import run, sleep
+from datetime import datetime
+from json import dumps, load, loads
+from os import listdir
+from os.path import isfile, join
+from re import findall, sub
+from shutil import move
+from typing import Optional
+
+from aiohttp import ClientSession
+
+now = datetime.now
+fromtimestamp = datetime.fromtimestamp
+
+# constants (these must match the sentinels in discourse_download.py)
+POST_LINK_ID = "f4706281-cc60-4ff0-a0b6-b803683cc24b"
+COMMENT_END_ID = "917f6034-2117-4a8c-bb42-b27fd7fb5e83"
+
+
+# TODO: verify the numbering; the link rewriting below assumes the discussions
+# are created with sequential numbers starting at --github_issue_number, and i
+# think the numbering may be off.
+async def main(github_issue_number: int, github_token: str) -> None:
+    # pass 1: index the exported topics and rewrite each sentinel link to the
+    # github issue url its destination topic is expected to receive
+    links = {}
+    for idx, filename in enumerate(sorted(listdir("./discourse-export"))):
+        if isfile(join("./discourse-export", filename)):
+            topic_id, rest = filename.split("_")
+            slug, _ = rest.split(".")
+            with open(f"./discourse-export/{topic_id}_{slug}.json", "r") as file:
+                links[slug] = {
+                    "id": topic_id,
+                    "idx": idx,
+                    "dests": set(findall(f'{POST_LINK_ID}/([A-Za-z0-9\\-]*?)\\\\"', file.read())),
+                }
+    for slug, data in links.items():
+        for dest in data["dests"]:
+            dest_data = links.get(dest, None)
+            if dest_data is None:
+                print(
+                    f"broken link: {slug}->{dest} (https://github.com/hail-is/hail/issues/{github_issue_number + data['idx']})"
+                )
+            else:
+                print(
+                    f"link: {slug} (https://github.com/hail-is/hail/issues/{github_issue_number + data['idx']}) -> {dest} (https://github.com/hail-is/hail/issues/{github_issue_number + dest_data['idx']})"
+                )
+                with open(f"./discourse-export/{data['id']}_{slug}.json", "r") as file:
+                    # the html is json-encoded, so the quote closing the href
+                    # appears as \" in the file
+                    json = sub(
+                        f'{POST_LINK_ID}/{dest}\\\\"',
+                        f"https://github.com/hail-is/hail/issues/{github_issue_number + dest_data['idx']}\\\"",
+                        file.read(),
+                    )
+                with open(f"./discourse-export/{data['id']}_{slug}.json", "w") as file:
+                    file.write(json)
+    # pass 2: upload each topic as a discussion, retrying every step until it
+    # succeeds, then move the file out of the queue
+    async with ClientSession() as session:
+        for issue in sorted([{"slug": slug, **data} for slug, data in links.items()], key=lambda x: x["idx"]):
+            with open(f"./discourse-export/{issue['id']}_{issue['slug']}.json", "r") as file:
+                topic = load(file)
+            discussion_id, label_applied, comment_idx = None, False, 0
+            comments = topic["html"].split(COMMENT_END_ID)
+            discussion_html = comments[0]
+            rest_comments = comments[1:]
+            while discussion_id is None:
+                discussion_id = await create_discussion(discussion_html, topic["title"], session, github_token)
+            while not label_applied:
+                label_applied = await apply_label(discussion_id, session, github_token)
+            while comment_idx < len(rest_comments):
+                comment_idx = await add_comment(
+                    comment_idx, rest_comments[comment_idx], discussion_id, session, github_token
+                )
+            move(
+                f"./discourse-export/{issue['id']}_{issue['slug']}.json",
+                f"./uploaded/{issue['id']}_{issue['slug']}.json",
+            )
+
+
+async def add_comment(comment_idx, comment_html, discussion_id, session, github_token):
+    comment_query = f"""
+    mutation {{
+      addDiscussionComment (
+        input: {{
+          discussionId: "{discussion_id}"
+          body: {dumps(comment_html)}
+        }}
+      ) {{
+        comment {{
+          id
+        }}
+      }}
+    }}
+    """
+    async with session.post(
+        "https://api.github.com/graphql",
+        json={"query": comment_query},
+        headers={
+            "Accept": "application/vnd.github+json",
+            "Authorization": f"Bearer {github_token}",
+            "Content-Type": "application/json; charset=utf-8",
+            "X-GitHub-Api-Version": "2022-11-28",
+        },
+    ) as comment_response:
+        comment_response_json = loads(await comment_response.read())
+        if comment_response_json.get("errors", None) is not None:
+            print(comment_response_json)
+            await handle_error(comment_response.headers)
+            return comment_idx
+        return comment_idx + 1
+
+
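+# retry protocol: on a graphql error, each mutation helper prints the response,
+# waits out the rate limit via handle_error, and returns a "no progress" value
+# (the unchanged comment_idx, False, or None), so the while loops in main just
+# call it again.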
+async def apply_label(discussion_id, session, github_token):
+    label_query = f"""
+    mutation {{
+      addLabelsToLabelable (
+        input: {{
+          labelableId: "{discussion_id}"
+          labelIds: ["LA_kwDOKFqpFc8AAAABajc5aQ"]
+        }}
+      ) {{
+        labelable {{
+          labels {{
+            totalCount
+          }}
+        }}
+      }}
+    }}
+    """
+    async with session.post(
+        "https://api.github.com/graphql",
+        json={"query": label_query},
+        headers={
+            "Accept": "application/vnd.github+json",
+            "Authorization": f"Bearer {github_token}",
+            "Content-Type": "application/json; charset=utf-8",
+            "X-GitHub-Api-Version": "2022-11-28",
+        },
+    ) as label_response:
+        label_response_json = loads(await label_response.read())
+        if label_response_json.get("errors", None) is not None:
+            print(label_response_json)
+            await handle_error(label_response.headers)
+            return False
+        return True
+
+
+async def create_discussion(
+    discussion_html, discussion_title, session: ClientSession, github_token: str
+) -> Optional[str]:
+    # dumps() escapes the body and title as graphql string literals
+    discussion_query = f"""
+    mutation {{
+      createDiscussion(
+        input: {{
+          repositoryId: "R_kgDOKFqpFQ",
+          categoryId: "DIC_kwDOKFqpFc4CYhFv",
+          body: {dumps(discussion_html)},
+          title: {dumps(discussion_title)}
+        }}
+      ) {{
+        discussion {{
+          id
+        }}
+      }}
+    }}
+    """
+    async with session.post(
+        "https://api.github.com/graphql",
+        json={"query": discussion_query},
+        headers={
+            "Accept": "application/vnd.github+json",
+            "Authorization": f"Bearer {github_token}",
+            "Content-Type": "application/json; charset=utf-8",
+            "X-GitHub-Api-Version": "2022-11-28",
+        },
+    ) as discussion_response:
+        discussion_response_json = loads(await discussion_response.read())
+        if discussion_response_json.get("errors", None) is not None:
+            print(discussion_response_json)
+            await handle_error(discussion_response.headers)
+            return None
+        return discussion_response_json["data"]["createDiscussion"]["discussion"]["id"]
+
+
+async def handle_error(headers):
+    retry_time = fromtimestamp(int(headers.get("X-RateLimit-Reset")))
+    if retry_time > now():
+        print(f"Retry time is {retry_time - now()}; waiting for 1 minute...")
+        await sleep(60)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--github_issue_number")
+    parser.add_argument("--github_token")
+    args = parser.parse_args()
+    run(main(int(args.github_issue_number), args.github_token))
+
+
+# TODO: replace the repository, category, and label ids hard-coded above with
+# the hail repo's values, found via the following queries
+
+# query {
+#   repository (name: "test-process", owner: "iris-garden") {
+#     id
+#     name
+#   }
+# }
+
+# query {
+#   repository (name: "test-process", owner: "iris-garden") {
+#     discussionCategories (first: 100) {
+#       edges {
+#         node {
+#           name
+#           id
+#         }
+#       }
+#     }
+#   }
+# }
+
+# query {
+#   repository (name: "test-process", owner: "iris-garden") {
+#     labels (first: 100) {
+#       edges {
+#         node {
+#           id
+#           name
+#         }
+#       }
+#     }
+#   }
+# }
diff --git a/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker b/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker
index ca6ee9cea8e..1f0955d450f 100644
--- a/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker
+++ b/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker
@@ -1 +1 @@
-mock-maker-inline
\ No newline at end of file
+mock-maker-inline