#!/usr/bin/env python3
# Copyright 2020 kubeflow.org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
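
"""Verify that the Markdown links in this repository resolve.

Scans the Markdown files matched by md_file_path_expressions, rewrites
relative links into absolute GitHub URLs, verifies links into this repository
against the local file tree and external links via HTTP requests (with basic
handling for GitHub rate limiting), and exits with a non-zero status if any
link returns 404, so the script can gate CI/CD runs.
"""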
import concurrent.futures
import itertools
import re
import sys
from glob import glob
from os import environ as env
from os.path import abspath, dirname, exists, relpath
from random import randint
from time import sleep
from typing import List, Tuple

import requests

GITHUB_REPO = env.get("GITHUB_REPO", "https://github.com/kubeflow/kfp-tekton/")
md_file_path_expressions = [
    "/guides/**/*.md",
    "/samples/**/*.md",
    "/sdk/**/*.md",
    "/*.md",
]
script_folder = abspath(dirname(__file__))
project_root_dir = abspath(dirname(dirname(script_folder)))
github_repo_master_path = "{}/blob/master".format(GITHUB_REPO.rstrip("/"))
parallel_requests = 60  # GitHub rate limiting allows about 60 requests per minute, after which we sleep a bit
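# cache of short URL -> HTTP status, shared across the worker threads so each
# unique URL is only checked once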
url_status_cache = dict()


def find_md_files() -> List[str]:
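    """Return a sorted list of the Markdown files matching md_file_path_expressions."""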
print("Checking for Markdown files here:\n")
for path_expr in md_file_path_expressions:
print(" " + path_expr.lstrip("/"))
print("")
md_files_list_of_lists = [glob(project_root_dir + path_expr, recursive=True)
for path_expr in md_file_path_expressions]
return sorted(list(itertools.chain(*md_files_list_of_lists)))


def get_links_from_md_file(md_file_path: str) -> List[Tuple[int, str, str]]:
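    """Extract all links from one Markdown file as (line, link_text, URL) tuples,
    rewriting relative links into absolute GitHub URLs so they can be verified."""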
    with open(md_file_path, "r") as f:
        md_file_content = f.read()
    folder = relpath(dirname(md_file_path), project_root_dir)
    # replace relative links that are siblings to the README, i.e. [link text](FEATURES.md)
    md_file_content = re.sub(
        r"\[([^]]+)\]\((?!http|#|/)([^)]+)\)",
        r"[\1]({}/{}/\2)".format(github_repo_master_path, folder).replace("/./", "/"),
        md_file_content)
    # replace links that are relative to the project root, i.e. [link text](/sdk/FEATURES.md)
    md_file_content = re.sub(
        r"\[([^]]+)\]\(/([^)]+)\)",
        r"[\1]({}/\2)".format(github_repo_master_path),
        md_file_content)
    # return the completed links
    line_text_url = []
    for line_number, line_text in enumerate(md_file_content.splitlines()):
        for (link_text, url) in re.findall(r"\[([^]]+)\]\((http[^)]+)\)", line_text):
            line_text_url.append((line_number + 1, link_text, url))
    return line_text_url


def test_url(file: str, line: int, text: str, url: str) -> Tuple[str, int, str, str, int]:
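    """Check a single URL and return (file, line, text, url, status).

    Links into this repository are verified against the local file tree to
    avoid GitHub rate limiting; all other URLs are checked via HTTP requests.
    """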
    short_url = url.split("#", maxsplit=1)[0]
    if short_url not in url_status_cache:
        # mind GitHub rate limiting, use local files to verify link
        if short_url.startswith(github_repo_master_path):
            local_path = short_url.replace(github_repo_master_path, "")
            if exists(abspath(project_root_dir + local_path)):
                status = 200
            else:
                status = 404
        else:
            try:
                status = requests.head(short_url, allow_redirects=True, timeout=5).status_code
                if status == 405:  # method not allowed, use GET instead of HEAD
                    status = requests.get(short_url, allow_redirects=True, timeout=5).status_code
                if status == 429:  # GitHub rate limiting, try again after 1 minute
                    sleep(randint(60, 90))
                    status = requests.head(short_url, allow_redirects=True, timeout=5).status_code
            except requests.exceptions.Timeout:
                status = 408
            except requests.exceptions.RequestException:
                status = 500
        url_status_cache[short_url] = status
    status = url_status_cache[short_url]
    return file, line, text, url, status


def verify_urls_concurrently(file_line_text_url: List[Tuple[str, int, str, str]]
                             ) -> List[Tuple[str, int, str, str, int]]:
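    """Check all URLs in a thread pool of parallel_requests workers and return
    the results as (file, line, text, url, status) tuples."""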
    file_line_text_url_status = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_requests) as executor:
        # map each future to its arguments, so a failure can be attributed to the
        # right link instead of whichever tuple the loop variables last held
        check_urls = {
            executor.submit(test_url, file, line, text, url): (file, line, text, url)
            for (file, line, text, url) in file_line_text_url
        }
        for url_check in concurrent.futures.as_completed(check_urls):
            (file, line, text, url) = check_urls[url_check]
            try:
                (file, line, text, url, status) = url_check.result()
                file_line_text_url_status.append((file, line, text, url, status))
            except Exception as e:
                print(str(type(e)))
                file_line_text_url_status.append((file, line, text, url, 500))
            finally:
                print("{}/{}".format(len(file_line_text_url_status),
                                     len(file_line_text_url)), end="\r")
    return file_line_text_url_status


def verify_doc_links() -> None:
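    """Find all Markdown files, verify their links, print a summary, and exit
    with status 1 if any link is broken (HTTP 404)."""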
    # 1. find all relevant Markdown files
    md_file_paths = find_md_files()

    # 2. extract all links with their text and URL
    file_line_text_url = [
        (file, line, text, url)
        for file in md_file_paths
        for (line, text, url) in get_links_from_md_file(file)
    ]

    # 3. validate the URLs
    file_line_text_url_status = verify_urls_concurrently(file_line_text_url)

    # 4. filter for the invalid URLs (status 404: "Not Found") to be reported
    file_line_text_url_404 = [(f, l, t, u, s)
                              for (f, l, t, u, s) in file_line_text_url_status
                              if s == 404]

    # 5. print some stats for confidence
    print("{} {} links ({} unique URLs) in {} Markdown files.\n".format(
        "Checked" if file_line_text_url_404 else "Verified",
        len(file_line_text_url_status),
        len(url_status_cache),
        len(md_file_paths)))

    # 6. report invalid links, exit with error for CI/CD
    if file_line_text_url_404:
        for (file, line, text, url, status) in file_line_text_url_404:
            print("{}:{}: `[{}]({})` {}".format(
                relpath(file, project_root_dir), line, text,
                url.replace(github_repo_master_path, ""), status))
        # print a summary line for clear error discovery at the bottom of the Travis job log
        print("\nERROR: Found {} invalid Markdown links".format(
            len(file_line_text_url_404)))
        sys.exit(1)


if __name__ == '__main__':
    verify_doc_links()