Skip to content

Commit

Permalink
feat: get repos from graphql (#120)
Browse files Browse the repository at this point in the history
* feat: get repos from graphql

context: codecov/engineering-team#971

Adds a function to get repositories from GitHub's GraphQL API.

It uses node_id to get the specified node from gh. This can be found in
the INSTALLATION webhook that we will process.
Along with it there will be the databaseId (service_id).

Adding the `expected_owner` entry because, when processing an installation webhook,
we already know what the expected owner is. Fetching the owner would require another request,
but it can be done (again by node_id).

The idea is to use this function when syncing repos if we know what the repos
covered by an installation are.

* add owner lookup and pagination

There was some question as to whether the nodes query supports pagination.
So that we don't trip up unnecessarily, I decided to implement simple pagination ourselves.
I made page_size 100 as github docs indicate that that's the max number of records
you can get at once from endpoints that use `first` or `last` pagination.
100 repos is a lot of repos.

I was also not sure if _all_ repos will actually belong to the same owner.
Just in case I decided to add owner lookup as well. Notice it does 1 extra request
per unique owner that we encounter.
We can use this info to upsert owners if needed.
  • Loading branch information
giovanni-guidini committed Feb 8, 2024
1 parent 1afb5ad commit d887db5
Show file tree
Hide file tree
Showing 4 changed files with 435 additions and 3 deletions.
140 changes: 137 additions & 3 deletions shared/torngit/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@

METRICS_PREFIX = "services.torngit.github"

GITHUB_REPO_COUNT_QUERY = """

class GitHubGraphQLQueries(object):
_queries = dict(
REPO_TOTALCOUNT="""
query {
viewer {
repositories(
Expand All @@ -47,11 +50,61 @@
}
}
}
"""
""",
REPOS_FROM_NODEIDS="""
query GetReposFromNodeIds($node_ids: [ID!]!) {
nodes(ids: $node_ids) {
__typename
... on Repository {
# databaseId == service_id
databaseId
name
primaryLanguage {
name
}
isPrivate
defaultBranchRef {
name
}
owner {
# This ID is actually the node_id, not the ownerid
id
login
}
}
}
}
""",
OWNER_FROM_NODEID="""
query GetOwnerFromNodeId($node_id: ID!) {
node(id: $node_id) {
__typename
... on Organization {
login
databaseId
}
... on User {
login
databaseId
}
}
}
""",
)

def get(self, query_name: str) -> Optional[str]:
    """Return the GraphQL query text registered under ``query_name``.

    Yields ``None`` when ``_queries`` has no entry with that name.
    """
    return self._queries.get(query_name)

def prepare(self, query_name: str, variables: dict) -> Optional[dict]:
    """Build the request body for the named GraphQL query.

    Returns a dict holding the query text under ``"query"`` and the given
    ``variables`` under ``"variables"``, or ``None`` when ``query_name`` is
    not a known query.
    """
    # If Query was an object we could validate the variables
    query_text = self.get(query_name)
    if query_text is None:
        return None
    return {"query": query_text, "variables": variables}


class Github(TorngitBaseAdapter):
service = "github"
graphql = GitHubGraphQLQueries()
urls = dict(
repo="{username}/{name}",
owner="{username}",
Expand Down Expand Up @@ -587,11 +640,92 @@ async def _fetch_number_of_repos(self, client, token):
client,
"post",
"/graphql",
body=dict(query=GITHUB_REPO_COUNT_QUERY),
body=dict(query=self.graphql.get("REPO_TOTALCOUNT")),
token=token,
)
return res["data"]["viewer"]["repositories"]["totalCount"]

async def _get_owner_from_nodeid(self, client, token, owner_node_id: str):
query = self.graphql.prepare(
"OWNER_FROM_NODEID", variables={"node_id": owner_node_id}
)
res = await self.api(
client,
"post",
"/graphql",
body=query,
token=token,
)
owner_data = res["data"]["node"]
return {"username": owner_data["login"], "service_id": owner_data["databaseId"]}

async def get_repos_from_nodeids_generator(
    self, repo_node_ids: List[str], expected_owner_username, *, token=None
):
    """Yield repo dicts fetched from the GitHub GraphQL API by repo node id.

    Each yielded dict has ``service_id``, ``name``, ``language``, ``private``,
    ``branch`` and ``owner`` keys. ``owner`` always carries ``node_id``,
    ``username`` and ``is_expected_owner``; when the owner's login differs from
    ``expected_owner_username`` the owner is additionally looked up by node id
    (one request per distinct owner) and the lookup result is merged in.

    Nodes that come back as ``None`` or that are not a ``Repository`` are skipped.

    The node ids are sent in batches of 100 because no built-in pagination was
    found for this query and 100 is believed to be the maximum accepted at once.
    """
    token = self.get_token_by_type_if_none(token, TokenType.read)
    # Owner lookups already performed, keyed by the owner's node id.
    owner_cache = {}
    async with self.get_client() as client:
        PAGE_SIZE = 100
        for start in range(0, len(repo_node_ids), PAGE_SIZE):
            batch = repo_node_ids[start : start + PAGE_SIZE]
            response = await self.api(
                client,
                "post",
                "/graphql",
                body=self.graphql.prepare(
                    "REPOS_FROM_NODEIDS", variables={"node_ids": batch}
                ),
                token=token,
            )
            for node in response["data"]["nodes"]:
                if node is None or node["__typename"] != "Repository":
                    continue
                # Both of these may be null in the response.
                language_ref = node.get("primaryLanguage")
                branch_ref = node.get("defaultBranchRef")
                repo = {
                    "service_id": node["databaseId"],
                    "name": node["name"],
                    "language": self._validate_language(
                        language_ref.get("name") if language_ref else None
                    ),
                    "private": node["isPrivate"],
                    "branch": branch_ref.get("name") if branch_ref else None,
                    "owner": {
                        "node_id": node["owner"]["id"],
                        "username": node["owner"]["login"],
                    },
                }
                matches_expected = (
                    repo["owner"]["username"] == expected_owner_username
                )
                if not matches_expected:
                    owner_node_id = repo["owner"]["node_id"]
                    if owner_node_id not in owner_cache:
                        owner_cache[
                            owner_node_id
                        ] = await self._get_owner_from_nodeid(
                            client, token, owner_node_id
                        )
                    repo["owner"] = {**repo["owner"], **owner_cache[owner_node_id]}

                repo["owner"]["is_expected_owner"] = matches_expected
                yield repo

async def list_repos_using_installation(self, username=None):
"""
returns list of repositories included in this integration
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
interactions:
- request:
body: '{"query": "\nquery GetReposFromNodeIds($node_ids: [ID!]!) {\n nodes(ids:
$node_ids) {\n __typename \n ... on Repository {\n #
databaseId == service_id\n databaseId\n name\n primaryLanguage
{\n name\n }\n isPrivate\n defaultBranchRef
{\n name\n }\n owner {\n #
This ID is actually the node_id, not the ownerid\n id\n login\n }\n }\n }\n}\n",
"variables": {"node_ids": ["R_kgDOHrbKcg", "R_kgDOLEJx2g"]}}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '613'
content-type:
- application/json
host:
- api.github.com
user-agent:
- Default
method: POST
uri: https://api.github.com/graphql
response:
content: '{"data":{"nodes":[{"__typename":"Repository","databaseId":515295858,"name":"example-python","primaryLanguage":{"name":"Shell"},"isPrivate":false,"defaultBranchRef":{"name":"master"},"owner":{"id":"U_kgDOBZOfKw","login":"codecove2e"}},{"__typename":"Repository","databaseId":742552026,"name":"test-no-languages","primaryLanguage":null,"isPrivate":false,"defaultBranchRef":null,"owner":{"id":"U_kgDOBZOfKw","login":"codecove2e"}}]}}'
headers:
Access-Control-Allow-Origin:
- '*'
Access-Control-Expose-Headers:
- ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining,
X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes,
X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO,
X-GitHub-Request-Id, Deprecation, Sunset
Content-Encoding:
- gzip
Content-Security-Policy:
- default-src 'none'
Content-Type:
- application/json; charset=utf-8
Date:
- Tue, 06 Feb 2024 13:21:07 GMT
Referrer-Policy:
- origin-when-cross-origin, strict-origin-when-cross-origin
Server:
- GitHub.com
Strict-Transport-Security:
- max-age=31536000; includeSubdomains; preload
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding, Accept, X-Requested-With
X-Accepted-OAuth-Scopes:
- repo
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- deny
X-GitHub-Media-Type:
- github.v4
X-GitHub-Request-Id:
- C11E:116D76:8B8D4:94D71:65C23242
X-OAuth-Scopes:
- repo
X-RateLimit-Limit:
- '5000'
X-RateLimit-Remaining:
- '4997'
X-RateLimit-Reset:
- '1707227531'
X-RateLimit-Resource:
- graphql
X-RateLimit-Used:
- '3'
X-XSS-Protection:
- '0'
http_version: HTTP/1.1
status_code: 200
version: 1
36 changes: 36 additions & 0 deletions tests/integration/test_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -1815,3 +1815,39 @@ async def test_list_github_app_webhook_redelivery(self, codecov_vcr):
)
res = await ghapp_handler.request_webhook_redelivery(17322555251)
assert res is True

@pytest.mark.asyncio
async def test_get_repos_from_nodeids_generator(self, valid_handler, codecov_vcr):
    """Both recorded repos are yielded in order, with the expected owner flagged."""
    # Both repos in the recorded response share this owner.
    expected_owner = {
        "node_id": "U_kgDOBZOfKw",
        "username": "codecove2e",
        "is_expected_owner": True,
    }
    expected = [
        {
            "service_id": 515295858,
            "name": "example-python",
            "language": "shell",
            "private": False,
            "branch": "master",
            "owner": dict(expected_owner),
        },
        {
            # This repo has neither a primary language nor a default branch.
            "service_id": 742552026,
            "name": "test-no-languages",
            "language": None,
            "private": False,
            "branch": None,
            "owner": dict(expected_owner),
        },
    ]
    received = [
        repo
        async for repo in valid_handler.get_repos_from_nodeids_generator(
            ["R_kgDOHrbKcg", "R_kgDOLEJx2g"], "codecove2e"
        )
    ]
    assert received == expected
Loading

0 comments on commit d887db5

Please sign in to comment.