
chore: Add heuristics for issue tracker #12703

Merged
merged 17 commits on May 22, 2024
1 change: 1 addition & 0 deletions .github/workflows/main.yml
@@ -20,6 +20,7 @@ on:
    # * is a special character in YAML so you have to quote this string
    # Run this Github Action every Tuesday at 7 AM UTC
    - cron: '0 7 * * 2'
  workflow_dispatch:

permissions: read-all

184 changes: 155 additions & 29 deletions scripts/updateapilist.py
@@ -16,9 +16,14 @@

import os
import requests
import logging
from typing import List, Optional
from dataclasses import dataclass

# Configure logging to output messages to console
logging.basicConfig(level=logging.INFO) # Set the desired logging level

import re

class MissingGithubToken(ValueError):
    """Raised when the GITHUB_TOKEN environment variable is not set"""
@@ -57,9 +62,20 @@ class MissingGithubToken(ValueError):
# BASE_API defines the base API for Github.
BASE_API = "https://api.github.com"

# GITHUB_ISSUES defines the issues URL for a repository on GitHub.
GITHUB_ISSUES = "https://github.com/{repo}/issues"

# BASE_ISSUE_TRACKER defines the base URL for the issue tracker.
BASE_ISSUE_TRACKER = "https://issuetracker.google.com"

# This issue-tracker component is part of some saved searches for listing API-side issues.
# However, when we construct URLs for filing new issues (which in some cases we do by analyzing
# the query string for a saved search), we want to ensure we DON'T file a new issue against
# this generic component but against a more specific one.
GENERIC_ISSUE_TRACKER_COMPONENT = "187065"

# This sentinel value is used to mark cache fields that have not been computed yet.
NOT_COMPUTED = -1

class CloudClient:
    repo: str = None
@@ -68,13 +84,97 @@ class CloudClient:
    distribution_name: str = None
    issue_tracker: str = None


    def __init__(self, repo: dict):
        self.repo = repo["repo"]
        # For now, strip out "Google Cloud" to standardize the titles
        self.title = repo["name_pretty"].replace("Google ", "").replace("Cloud ", "")
        self.release_level = repo["release_level"]
        self.distribution_name = repo["distribution_name"]
        self.issue_tracker = repo.get("issue_tracker")
        self._cached_component_id = NOT_COMPUTED
        self._cached_template_id = NOT_COMPUTED
        self._cached_saved_search_id = NOT_COMPUTED
    @property
    def saved_search_id(self):
        if self._cached_saved_search_id != NOT_COMPUTED:
            return self._cached_saved_search_id
        if not self.issue_tracker:
            self._cached_saved_search_id = None
        else:
            match = re.search(r'savedsearches/(\d+)', self.issue_tracker)
            self._cached_saved_search_id = match.group(1) if match else None
        return self._cached_saved_search_id

    @property
    def saved_search_response_text(self):
        if not self.saved_search_id:
            return None
        url = f"{BASE_ISSUE_TRACKER}/action/saved_searches/{self.saved_search_id}"
        response = _fetch_response(url)
        return response.text if response else None

    @property
    def issue_tracker_component_id(self):
        if self._cached_component_id != NOT_COMPUTED:
            return self._cached_component_id

        # First, check if the issue tracker is a saved search:
        query_string = self.saved_search_response_text or self.issue_tracker
        if not query_string:
            self._cached_component_id = None
        else:
            # Try to match 'component=' in the query string
            query_match = re.search(r'\bcomponent=(\d+)', query_string)
            if query_match:
                self._cached_component_id = query_match.group(1)
            else:
                # If not found, try to match 'componentid:' in the query string
                query_match = re.findall(r'\bcomponentid:(\d+)', query_string)
                for component_id in query_match:
                    if component_id == GENERIC_ISSUE_TRACKER_COMPONENT:
                        continue
                    if self._cached_component_id != NOT_COMPUTED:
                        self._cached_component_id = None
                        logging.error(f"More than one component ID found for issue tracker: {self.issue_tracker}")
                        break
                    self._cached_component_id = component_id
        self._cached_component_id = self._cached_component_id if self._cached_component_id != NOT_COMPUTED else None
        return self._cached_component_id
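    # Hypothetical illustration (not part of this PR) of the two query-string
    # shapes the heuristic above handles. Every component ID below other than
    # the generic 187065 is made up:
    #
    #   re.search(r'\bcomponent=(\d+)', ".../issues/new?component=123456")
    #     -> matches, with group(1) == "123456"
    #
    #   re.findall(r'\bcomponentid:(\d+)', "componentid:187065 componentid:123456")
    #     -> ["187065", "123456"]; the loop skips the generic "187065" and
    #        caches "123456" as the component ID.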

    @property
    def issue_tracker_template_id(self):
        if self._cached_template_id != NOT_COMPUTED:
            return self._cached_template_id
        if not self.issue_tracker:
            self._cached_template_id = None
        else:
            match = re.search(r'(?:\?|&)template=(\d+)', self.issue_tracker)
            self._cached_template_id = match.group(1) if match else None
        return self._cached_template_id

    @property
    def show_client_issues(self):
        return GITHUB_ISSUES.format(repo=self.repo)

    @property
    def file_api_issue(self):
        if self.issue_tracker_component_id:
            link = f"{BASE_ISSUE_TRACKER}/issues/new?component={self.issue_tracker_component_id}"
            if self.issue_tracker_template_id:
                link += f"&template={self.issue_tracker_template_id}"
            return link
        return None

    @property
    def show_api_issues(self):
        if self.saved_search_id:
            # Return the original issue_tracker content, which already links to the saved search.
            return self.issue_tracker
        elif self.issue_tracker_component_id:
            return f"{BASE_ISSUE_TRACKER}/issues?q=componentid:{self.issue_tracker_component_id}"
        return None
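    # Hypothetical illustration (not part of this PR): for a made-up client
    # whose issue_tracker yields component "123456" and template "654321",
    # the link properties above would produce:
    #   file_api_issue  -> "https://issuetracker.google.com/issues/new?component=123456&template=654321"
    #   show_api_issues -> "https://issuetracker.google.com/issues?q=componentid:123456"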

    # For sorting, we want to sort by release level, then API pretty_name
    def __lt__(self, other):
@@ -95,15 +195,35 @@ class Extractor:
    def client_for_repo(self, repo_slug) -> Optional[CloudClient]:
        path = self.path_format.format(repo_slug=repo_slug)
        url = f"{RAW_CONTENT_BASE_URL}/{path}/{REPO_METADATA_FILENAME}"
-        response = requests.get(url)
-        if response.status_code != requests.codes.ok:
-            return
-
-        return CloudClient(response.json())
+        _, metadata = _fetch_and_parse_response(url)
+        if not metadata:
+            return None
+        return CloudClient(metadata)

    def get_clients_from_batch_response(self, response_json) -> List[CloudClient]:
        return [self.client_for_repo(repo[self.response_key]) for repo in response_json if allowed_repo(repo)]

def _fetch_response(url: str, headers: Optional[dict] = None, params: Optional[dict] = None) -> Optional[requests.Response]:
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        logging.error(f"Request failed for URL {url}: {e}")
        return None

def _parse_response(response: requests.Response) -> Optional[dict]:
    try:
        return response.json()
    except ValueError as e:
        logging.error(f"JSON decoding failed for URL {response.url}: {e}")
        return None

def _fetch_and_parse_response(url: str, headers: Optional[dict] = None, params: Optional[dict] = None):
    response = _fetch_response(url, headers, params)
    if not response:
        return None, None
    return response, _parse_response(response)
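A brief usage sketch of the helpers above (not part of this PR; the repository path is only an example): `_fetch_and_parse_response` returns `(None, None)` when the request fails, and `(response, None)` when the body is not valid JSON.

response, data = _fetch_and_parse_response(
    f"{BASE_API}/repos/googleapis/google-cloud-python"
)
if data:
    logging.info(f"Fetched metadata for {data.get('full_name')}")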

def replace_content_in_readme(content_rows: List[str]) -> None:
    START_MARKER = ".. API_TABLE_START"
@@ -135,16 +255,17 @@ def client_row(client: CloudClient) -> str:
url = f"https://github.com/{client.repo}"
if client.repo == MONO_REPO:
url += f"/tree/main/packages/{client.distribution_name}"

_show_api_issues = client.show_api_issues
_file_api_issue = client.file_api_issue
Reviewer comment from a Contributor, on lines +258 to +259:

(No action needed--just a comment)
You could also memoize the two properties on the RHS. The complication is that for this application, you need two pieces of information for the memoization (as you implemented above), and that's because the cached value could be None. In many applications, the memoized value is not None, so the value None can stand for "not yet cached". So I don't think it's worth pursuing here, because it adds two fields to the class, and this is certainly clear enough.

(A sketch of the two-field pattern the comment describes follows this function.)

    content_row = [
        f" * - `{client.title} <{url}>`_\n",
-        f" - " + client.release_level + "\n",
-        f" - |PyPI-{client.distribution_name}|\n",
+        f" - {client.release_level}\n",
+        f" - |PyPI-{client.distribution_name}|\n",
+        f" - `API Issues <{_show_api_issues}>`_\n" if _show_api_issues else " -\n",
+        f" - `File an API Issue <{_file_api_issue}>`_\n" if _file_api_issue else " -\n",
+        f" - `Client Library Issues <{client.show_client_issues}>`_\n"
    ]

-    if client.issue_tracker:
-        content_row.append(f" - `API Issues <{client.issue_tracker}>`_\n")

    return (content_row, pypi_badge)
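For illustration, here is a minimal sketch of the two-field memoization described in the reviewer comment above. It is hypothetical and not part of this PR: the class name is invented, and a separate boolean records whether the value was computed, so a legitimately-None result stays distinct from "not yet cached".

class TwoFieldMemoSketch:
    """Hypothetical illustration only; not part of the PR."""

    def __init__(self, issue_tracker: Optional[str]):
        self.issue_tracker = issue_tracker
        self._saved_search_id: Optional[str] = None
        self._saved_search_id_computed = False

    @property
    def saved_search_id(self) -> Optional[str]:
        # The boolean, not the cached value, records whether the work was
        # done, so a cached result of None is unambiguous.
        if not self._saved_search_id_computed:
            match = re.search(r'savedsearches/(\d+)', self.issue_tracker or "")
            self._saved_search_id = match.group(1) if match else None
            self._saved_search_id_computed = True
        return self._saved_search_id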


@@ -157,7 +278,9 @@ def generate_table_contents(clients: List[CloudClient]) -> List[str]:
" * - Client\n",
" - Release Level\n",
" - Version\n",
" - API Issue Tracker\n",
" - API Issues\n",
" - File an API Issue\n",
" - Client Library Issues\n",
]

pypi_links = ["\n"]
@@ -181,30 +304,33 @@ def mono_repo_clients(token: str) -> List[CloudClient]:
    # all mono repo clients
    url = f"{BASE_API}/repos/{MONO_REPO}/contents/packages"
    headers = {'Authorization': f'token {token}'}
-    response = requests.get(url=url, headers=headers)
+    _, packages = _fetch_and_parse_response(url, headers)
+    if not packages:
+        return []
    mono_repo_extractor = Extractor(path_format=MONO_REPO_PATH_FORMAT, response_key=PACKAGE_RESPONSE_KEY)

-    return mono_repo_extractor.get_clients_from_batch_response(response.json())
+    return mono_repo_extractor.get_clients_from_batch_response(packages)


def split_repo_clients(token: str) -> List[CloudClient]:

-    first_request = True
-    while first_request or 'next' in response.links:
-        if first_request:
-            url = f"{BASE_API}/search/repositories?page=1"
-            first_request = False
-        else:
-            url = response.links['next']['url']
-        headers = {'Authorization': f'token {token}'}
-        params = {'per_page': 100, "q": "python- in:name org:googleapis"}
-        response = requests.get(url=url, params=params, headers=headers)
-        repositories = response.json().get("items", [])
+    clients = []
+    url = f"{BASE_API}/search/repositories?page=1"
+    headers = {'Authorization': f'token {token}'}
+    params = {'per_page': 100, "q": "python- in:name org:googleapis"}
+
+    while url:
+        response, metadata = _fetch_and_parse_response(url, headers, params)
+        if not metadata:
+            break
+        repositories = metadata.get("items", [])
        if len(repositories) == 0:
            break

        split_repo_extractor = Extractor(path_format=SPLIT_REPO_PATH_FORMAT, response_key=REPO_RESPONSE_KEY)
-        return split_repo_extractor.get_clients_from_batch_response(repositories)
+        clients.extend(split_repo_extractor.get_clients_from_batch_response(repositories))
+
+        # Check for the 'next' link in the response headers for pagination
+        url = response.links.get('next', {}).get('url')
+
+    return clients
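A note on the pagination above: requests parses the HTTP Link response header into `response.links`, so following `links.get('next', {}).get('url')` walks the search pages until no "next" relation remains. A minimal sketch (not part of this PR; the query parameters are copied from the function above):

resp = requests.get(
    f"{BASE_API}/search/repositories",
    params={'per_page': 100, "q": "python- in:name org:googleapis"},
)
# A header like:  Link: <https://api.github.com/search/repositories?page=2>; rel="next"
# is parsed as:   resp.links == {'next': {'url': '...', 'rel': 'next'}}
next_url = resp.links.get('next', {}).get('url')  # None-safe: {} when absent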


def get_token():