diff --git a/notifications/notifications.py b/notifications/notifications.py
index d969de37..020eb677 100644
--- a/notifications/notifications.py
+++ b/notifications/notifications.py
@@ -12,6 +12,9 @@ TOKEN_NAME = "GITHUB_TOKEN"
 
 PULL_REQUEST_TYPE = "PullRequest"
 
+# TODO(jlewi): Rewrite this code to use:
+# i) graphql.unpack_and_split_nodes
+# ii) graphql.ShardWriter
 def process_notification(n):
   # Mark as read anything that isn't an explicit mention.
   # For PRs there doesn't seem like a simple way to detect if the notice
diff --git a/py/code_intelligence/graphql.py b/py/code_intelligence/graphql.py
index 28458ba7..481e7aea 100644
--- a/py/code_intelligence/graphql.py
+++ b/py/code_intelligence/graphql.py
@@ -1,5 +1,6 @@
 """This module contains utilities for working with GitHub's graphql API"""
-
+from code_intelligence import util
+import logging
 import os
 import requests
 
@@ -12,12 +13,84 @@ def __init__(self):
     self._headers = {"Authorization":
                      "Bearer {0}".format(os.getenv("GITHUB_TOKEN"))}
 
-  def run_query(self, query):
-    """Issue the GraphQL query and return the results."""
+  def run_query(self, query, variables=None):
+    """Issue the GraphQL query and return the results.
+
+    Args:
+      query: String containing the query
+      variables: (Optional) dictionary of query variables
+
+    Returns:
+      The decoded JSON body of the response.
+
+    Raises:
+      Exception: if the response status code is not 200.
+    """
+    payload = {'query': query}
+
+    if variables:
+      payload["variables"] = variables
+
     request = requests.post('https://api.github.com/graphql',
-                            json={'query': query}, headers=self._headers)
+                            json=payload, headers=self._headers)
     if request.status_code == 200:
       return request.json()
     else:
       raise Exception("Query failed to run by returning code of {}. {}".format(
-        request.status_code, query))
\ No newline at end of file
+        request.status_code, query))
+
+def unpack_and_split_nodes(data, path):
+  """Unpack the nodes from a list of GraphQL edges.
+
+  Args:
+    data: A dictionary containing the results
+    path: A list of fields indicating the fields to select. The value
+      reached by following the path should be a list of edges, each of
+      which is a dict with a "node" key.
+
+  Returns:
+    items: A list of dicts; each dict is the node of one edge. The list
+      is empty if a field on the path is missing or null.
+  """
+  child = data
+
+  for field in path:
+    # If there are no edges then the field will not exist (or is null)
+    if not child or field not in child:
+      return []
+    child = child[field]
+
+  if not child:
+    return []
+
+  return [edge["node"] for edge in child]
+
+class ShardWriter(object):
+  """Write items as a set of file shards"""
+
+  def __init__(self, total_shards, output_dir, prefix="items"):
+    """Create a writer.
+
+    Args:
+      total_shards: Total number of shards; only used in the file names
+      output_dir: Directory in which the shard files are written
+      prefix: Prefix for the names of the shard files
+    """
+    self.output_dir = output_dir
+    self.total_shards = total_shards
+    # Index of the next shard to be written
+    self.shard = 0
+    self.prefix = prefix
+
+  def write_shard(self, items):
+    """Write items to the next shard file and advance the shard index.
+
+    Args:
+      items: The items to write; each item is written as one line of json
+    """
+    shard_file = os.path.join(
+      self.output_dir,
+      self.prefix + "-{0:03d}-of-{1:03d}.json".format(
+        self.shard, self.total_shards))
+    util.write_items_to_json(shard_file, items)
+    self.shard += 1
diff --git a/py/code_intelligence/test_data/issues_for_triage.json b/py/code_intelligence/test_data/issues_for_triage.json
new file mode 100644
index 00000000..f17087dd
--- /dev/null
+++ b/py/code_intelligence/test_data/issues_for_triage.json
@@ -0,0 +1,4 @@
+{"author": {"__typename": "User", "login": "jlewi"}, "title": "Doesn't need triage", "body": "", "url": "https://github.com/kubeflow/kubeflow/issues/365", "labels": {"totalCount": 3, "edges": [{"node": {"name": "kind/bug"}}, {"node": {"name": "area/jupyter"}}, {"node": {"name": "priority/p1"}}]}, "projectCards": {"totalCount": 1, "edges": [{"node": {"project": {"name": "0.2 Release", "number": 2}}}]}}
+{"author": {"__typename": "User", "login": "jlewi"}, "title": "Needs triage; missing everything", "body": "", "url": "https://github.com/kubeflow/kubeflow/issues/365", "labels": {"totalCount": 0, "edges": []}, "projectCards": {"totalCount": 0, "edges": []}}
+{"author": {"__typename": "User", "login": "jlewi"}, "title": "Needs triage; missing projectCards", "body": "", "url": "https://github.com/kubeflow/kubeflow/issues/365", "labels": {"totalCount": 0, "edges": []}}
+{"author": {"__typename": "User", "login": "jlewi"}, "title": "Needs triage; p0 without a project", "body": "", "url": "https://github.com/kubeflow/kubeflow/issues/365", "labels": {"totalCount": 3, "edges": [{"node": {"name": "priority/p0"}}, {"node": {"name": "kind/bug"}}, {"node": {"name": "area/jupyter"}}]}, "projectCards": {"totalCount": 0, "edges": []}}
diff --git a/py/code_intelligence/triage.py b/py/code_intelligence/triage.py
new file mode 100644
index 00000000..4cfab9b5
--- /dev/null
+++ b/py/code_intelligence/triage.py
@@ -0,0 +1,376 @@
+"""Identify issues that need triage."""
+from code_intelligence import graphql
+from code_intelligence import util
+import fire
+import github3
+import json
+import logging
+import os
+import numpy as np
+import pprint
+import retrying
+
+TOKEN_NAME = "GITHUB_TOKEN"
+
+# TODO(jlewi): If we make this an app maybe we should read this from a .github
+# file
+ALLOWED_KINDS = ["improvement/enhancement", "community/question", "kind/bug"]
+ALLOWED_PRIORITY = ["priority/p0", "priority/p1", "priority/p2",
+                    "priority/p3"]
+
+# Issues with one of these priorities must be assigned to a project
+REQUIRES_PROJECT = ["priority/p0", "priority/p1"]
+
+class TriageInfo(object):
+  """Class describing whether an issue needs triage"""
+
+  # Names of the boolean attributes recording why an issue needs triage
+  FIELDS = ("missing_kind", "missing_area", "missing_priority",
+            "missing_project")
+
+  def __init__(self):
+    # Booleans indicating why triage might fail; an issue is assumed to be
+    # missing everything until its labels show otherwise
+    self.missing_kind = True
+    self.missing_priority = True
+    self.missing_project = True
+    self.missing_area = True
+
+  @classmethod
+  def from_issue(cls, issue):
+    """Construct TriageInfo from the supplied issue
+
+    Args:
+      issue: Dictionary describing the issue; the labels are read from
+        issue["labels"]["edges"] and the project cards from
+        issue["projectCards"]["edges"]. Either field may be absent.
+
+    Returns:
+      info: Instance of TriageInfo for the issue
+    """
+    info = cls()
+
+    labels = graphql.unpack_and_split_nodes(issue, ["labels", "edges"])
+
+    project_cards = graphql.unpack_and_split_nodes(issue,
+                                                   ["projectCards", "edges"])
+
+    requires_project = False
+    for l in labels:
+      name = l["name"]
+
+      if name in ALLOWED_KINDS:
+        info.missing_kind = False
+
+      if name in ALLOWED_PRIORITY:
+        info.missing_priority = False
+
+      if name in REQUIRES_PROJECT:
+        requires_project = True
+
+      if name.startswith("area"):
+        info.missing_area = False
+
+    # Decide about the project only after all the labels have been seen so
+    # that the result doesn't depend on the order of the labels. An issue
+    # without any labels keeps the default of missing_project=True.
+    if labels:
+      info.missing_project = requires_project and not project_cards
+
+    return info
+
+  def __eq__(self, other):
+    if not isinstance(other, TriageInfo):
+      return NotImplemented
+
+    return all(getattr(self, f) == getattr(other, f) for f in self.FIELDS)
+
+  @property
+  def needs_triage(self):
+    """Return true if the issue needs triage"""
+    return any(getattr(self, f) for f in self.FIELDS)
+
+  def __repr__(self):
+    pieces = ["needs_triage={0}".format(self.needs_triage)]
+
+    if self.needs_triage:
+      for f in self.FIELDS:
+        pieces.append("{0}={1}".format(f, getattr(self, f)))
+
+    return ";".join(pieces)
+
+  def message(self):
+    """Return a human readable message."""
+    if not self.needs_triage:
+      return "Issue doesn't need attention."
+
+    lines = ["Issue needs triage:"]
+
+    if self.missing_kind:
+      lines.append("\t Issue needs one of the labels {0}".format(ALLOWED_KINDS))
+
+    if self.missing_priority:
+      lines.append("\t Issue needs one of the priorities {0}".format(
+        ALLOWED_PRIORITY))
+
+    if self.missing_area:
+      lines.append("\t Issue needs an area label")
+
+    if self.missing_project:
+      lines.append(("\t Issues with priority in {0} need to be assigned to a "
+                    "project").format(REQUIRES_PROJECT))
+
+    return "\n".join(lines)
+
+def _log_query_errors(results):
+  """Log the errors, if any, in the response to a GraphQL query.
+
+  Args:
+    results: Decoded response returned by GraphQLClient.run_query
+
+  Returns:
+    True if the response contained errors and False otherwise
+  """
+  errors = results.get("errors")
+  if not errors:
+    return False
+
+  logging.error("There was a problem issuing the query; errors:\n%s",
+                json.dumps(errors, indent=2))
+  return True
+
+class IssueTriage(object):
+  """Commands, exposed on the command line via fire, for triaging issues"""
+
+  def mark_read(self, user):
+    """Process the GitHub notifications of the supplied user.
+
+    Args:
+      user: GitHub username used to create the client
+
+    Raises:
+      ValueError: if the environment variable TOKEN_NAME isn't set
+    """
+    token = os.getenv(TOKEN_NAME)
+    if not token:
+      raise ValueError(("Environment variable {0} needs to be set to a GitHub "
+                        "token.").format(TOKEN_NAME))
+    client = github3.GitHub(username=user, token=token)
+    notifications = client.notifications()
+
+    # https://developer.github.com/v3/activity/notifications/
+    #
+    # How do we identify closed pull requests?
+    #
+    # NOTE(review): process_notification isn't defined or imported in this
+    # module (a function of that name lives in
+    # notifications/notifications.py) so this loop looks like it raises
+    # NameError on the first notification; TODO confirm the intended import.
+    for n in notifications:
+      process_notification(n)
+
+  def _fetch_issues(self, org, repo, output=None):
+    """Fetch issues for a repository
+
+    Args:
+      org: The org that owns the repository
+      repo: The name of the repository
+      output: The directory to write the results; if not specified results
+        are not downloaded
+
+    Writes each page of issues to a shard file in the output directory.
+    """
+    client = graphql.GraphQLClient()
+
+    num_issues_per_page = 100
+
+    # TODO(jlewi):Use query variables
+    query_template = """{{
+repository(owner: "{org}", name: "{repo}") {{
+  issues(first:{num_issues_per_page} {issues_cursor}) {{
+    totalCount
+    pageInfo {{
+      endCursor
+      hasNextPage
+    }}
+    edges{{
+      node {{
+        author {{
+          __typename
+          ... on User {{
+            login
+          }}
+
+          ... on Bot{{
+            login
+          }}
+        }}
+        title
+        body
+        url
+        labels(first:30, ){{
+          totalCount
+          edges {{
+            node {{
+              name
+            }}
+          }}
+        }}
+        projectCards(first:30, ){{
+          totalCount
+          edges {{
+            node {{
+              project {{
+                name
+                number
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+  }}
+}}
+}}
+"""
+
+    # output is optional so only create the directory if one was specified
+    if output and not os.path.exists(output):
+      os.makedirs(output)
+
+    num_pages = None
+    total_issues = None
+    has_next_issues_page = True
+    # TODO(jlewi): We should persist the cursors to disk so we can resume
+    # after errors
+    issues_cursor = None
+    shard_writer = None
+    while has_next_issues_page:
+      issues_cursor_text = ""
+      if issues_cursor:
+        issues_cursor_text = "after:\"{0}\"".format(issues_cursor)
+      query = query_template.format(org=org, repo=repo,
+                                    num_issues_per_page=num_issues_per_page,
+                                    issues_cursor=issues_cursor_text)
+      results = client.run_query(query)
+
+      if _log_query_errors(results):
+        return
+
+      issues_data = results["data"]["repository"]["issues"]
+
+      if total_issues is None:
+        total_issues = issues_data["totalCount"]
+        num_pages = int(np.ceil(total_issues/float(num_issues_per_page)))
+        logging.info("%s/%s has a total of %s issues", org, repo, total_issues)
+
+      if output and not shard_writer:
+        logging.info("initializing the shard writer")
+        shard_writer = graphql.ShardWriter(
+          num_pages, output, prefix="issues-{0}-{1}".format(org, repo))
+
+      issues = graphql.unpack_and_split_nodes(issues_data, ["edges"])
+
+      if shard_writer:
+        shard_writer.write_shard(issues)
+
+      page_info = issues_data["pageInfo"]
+      issues_cursor = page_info["endCursor"]
+      has_next_issues_page = page_info["hasNextPage"]
+
+  def _issue_needs_triage(self, issue):
+    """Check if the supplied issue needs triage.
+
+    Args:
+      issue: json dictionary describing the issue
+
+    Returns:
+      triage_info: Instance of TriageInfo explaining whether the issue needs
+        triage
+    """
+    return TriageInfo.from_issue(issue)
+
+  def triage(self, repo, output=None):
+    """Triage issues in the specified repository.
+
+    Args:
+      repo: Repository in the form {org}/{repo}
+      output: (Optional) directory to write issues
+    """
+    org, repo_name = repo.split("/")
+
+    self._fetch_issues(org, repo_name, output=output)
+
+  def triage_issue(self, url):
+    """Triage a single issue.
+
+    Args:
+      url: The url of the issue e.g.
+        https://github.com/kubeflow/community/issues/280
+    """
+    client = graphql.GraphQLClient()
+
+    query = """query getIssue($url: URI!) {
+  resource(url: $url) {
+    __typename
+    ... on Issue {
+      author {
+        __typename
+        ... on User {
+          login
+        }
+        ... on Bot {
+          login
+        }
+      }
+      title
+      body
+      url
+      labels(first: 30) {
+        totalCount
+        edges {
+          node {
+            name
+          }
+        }
+      }
+      projectCards(first: 30) {
+        totalCount
+        edges {
+          node {
+            project {
+              name
+              number
+            }
+          }
+        }
+      }
+    }
+  }
+}"""
+
+    variables = {
+      "url": url
+    }
+    results = client.run_query(query, variables=variables)
+
+    if _log_query_errors(results):
+      return
+
+    issue = results["data"]["resource"]
+
+    if not issue:
+      logging.error("No resource found for url %s", url)
+      return
+
+    info = TriageInfo.from_issue(issue)
+    logging.info("Issue %s:\nstate:%s\n", url, info.message())
+
+if __name__ == "__main__":
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(message)s|%(pathname)s|%(lineno)d|'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+
+  fire.Fire(IssueTriage)
diff --git a/py/code_intelligence/triage_test.py b/py/code_intelligence/triage_test.py
new file mode 100644
index 00000000..cdd312d4
--- /dev/null
+++ b/py/code_intelligence/triage_test.py
@@ -0,0 +1,52 @@
+from code_intelligence import triage
+import json
+import logging
+import os
+import pytest
+
+def build_info(missing_kind, missing_priority, missing_area, missing_project):
+  """Build the TriageInfo expected by a test case."""
+  info = triage.TriageInfo()
+  info.missing_kind = missing_kind
+  info.missing_area = missing_area
+  info.missing_priority = missing_priority
+  info.missing_project = missing_project
+  return info
+
+def test_triage_info():
+  expected = [
+    build_info(False, False, False, False),
+    build_info(True, True, True, True),
+    build_info(True, True, True, True),
+    # A p0 issue without a project card is only missing a project
+    build_info(False, False, False, True),
+  ]
+
+  # Locate the test data relative to this file so that the test doesn't
+  # depend on the current working directory
+  test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "test_data", "issues_for_triage.json")
+
+  with open(test_file) as hf:
+    actual = [triage.TriageInfo.from_issue(json.loads(l)) for l in hf]
+
+  assert len(expected) == len(actual)
+
+  for i, (e, a) in enumerate(zip(expected, actual)):
+    assert e == a
+
+    if i == 0:
+      assert not e.needs_triage
+    else:
+      assert e.needs_triage
+
+if __name__ == "__main__":
+  logging.basicConfig(
+    level=logging.INFO,
+    format=('%(levelname)s|%(asctime)s'
+            '|%(pathname)s|%(lineno)d| %(message)s'),
+    datefmt='%Y-%m-%dT%H:%M:%S',
+  )
+  logging.getLogger().setLevel(logging.INFO)
+
+  pytest.main()
diff --git a/py/code_intelligence/util.py b/py/code_intelligence/util.py
new file mode 100644
index 00000000..78844b6b
--- /dev/null
+++ b/py/code_intelligence/util.py
@@ -0,0 +1,16 @@
+import logging
+import json
+
+def write_items_to_json(output_file, results):
+  """Write the items to a file as newline delimited json.
+
+  Args:
+    output_file: Path of the file to write; it is overwritten if it exists
+    results: A list of json serializable items; each one is written on its
+      own line
+  """
+  with open(output_file, "w") as hf:
+    for i in results:
+      json.dump(i, hf)
+      hf.write("\n")
+  logging.info("Wrote %s items to %s", len(results), output_file)