forked from sdgilley/learn-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind-snippets.py
123 lines (107 loc) · 4.77 KB
/
find-snippets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'''
This script reads through the files in azure-docs (main) and finds code snippets from azureml-examples
It creates two files:
* refs-found.csv - needed for the merge-report and pr-report scripts
* CODEOWNERS.txt - use the contents to populate the CODEOWNERS file in azureml-examples
Run this script periodically to stay up to date with the latest references.
'''
import os
import re
import sys
import utilities as h
import gh_auth as a
import pandas as pd
from datetime import datetime
###################### INPUT HERE ############################
# Name the path to your repo. If trying to use a private repo, you'll need a token that has access to it.
repo_name = "MicrosoftDocs/azure-docs"
repo_branch = "main"
path_in_repo = 'articles/machine-learning'
############################ DONE ############################
# Output files. Don't change result_fn, report-pr.py needs this file to work.
# All outputs are written next to this script so downstream scripts find them
# regardless of the current working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
result_fn = os.path.join(script_dir, "refs-found.csv")
tutorials_fn = os.path.join(script_dir, "tutorials.csv")
code_counts_fn = os.path.join(script_dir, "code-counts.csv")
az_ml_branch = "azureml-examples-main"

dict_list = []       # snippet refs that point at the main branch of azureml-examples
dict_list2 = []      # per-article code-block line counts
tutorials_list = []  # whole-notebook tutorial references (nbstart markers)
branches = []        # every azureml-examples branch seen in any snippet ref

# Record the start time
start_time = datetime.now()

# Read files from GitHub
repo = a.connect_repo(repo_name)
contents = repo.get_contents(path_in_repo, ref=repo_branch)
print(f"Starting search at {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
for content_file in contents:
    # Only markdown articles can reference snippets
    if content_file.path.endswith(".md"):
        file = os.path.basename(content_file.path)
        # Get the file content
        file_content = content_file.decoded_content
        lines = file_content.decode().splitlines()
        blocks = []
        count = 0
        code_type = None
        inside_code_block = False
        for line in lines:
            # snippets have ~\azureml-examples in them. Find all snippets in this file.
            match_snippet = re.findall(r'\(~\/azureml-examples[^)]*\)|source="~\/azureml-examples[^"]*"', line)
            if match_snippet:
                for match in match_snippet:
                    path, ref_file, branch, match, name = h.cleanup_matches(match)
                    branches.append(branch)
                    if branch == az_ml_branch:  # PRs are merged into main, so only these files are relevant
                        row_dict = {'ref_file': ref_file, 'from_file': file}
                        dict_list.append(row_dict)
            # count lines in code snippets
            blocks, inside_code_block, count, code_type = h.count_code_lines(line, blocks, inside_code_block, count, code_type)
            # now look for tutorials that use a whole file
            match_tutorial = re.search(r'nbstart\s+(.*?)\s+-->', line)
            if match_tutorial:
                file_name = match_tutorial.group(1)
                tutorials_list.append({'tutorial': file_name, 'from_file': file})
        # done looking through lines of this file
        if inside_code_block:
            # an unterminated ``` fence — flag it so the article can be fixed
            print(f"{file}: Warning: A code block started but did not end.")
            print(f"    The last code block type was {code_type} and had {count} lines.")
        if blocks:
            # this file has code blocks. add info to the dictionary
            for block in blocks:
                dict_list2.append({'file': file, 'type': block[0], 'lines': block[1]})

code_counts = pd.DataFrame.from_dict(dict_list2)
code_counts.to_csv(code_counts_fn, index=False)
found = pd.DataFrame.from_dict(dict_list)
branches = pd.DataFrame(branches)
tutorials = pd.DataFrame(tutorials_list)
# get rid of duplicates
found = found.drop_duplicates()
branches = branches.drop_duplicates()
# sort the file
if not found.empty:
    found = found.sort_values(by=['ref_file'])
else:
    print("No references found")
    sys.exit()
# write the snippets file
found.to_csv(result_fn, index=False)
# write the tutorials file
# these files won't break the build, so don't need to be in the CODEOWNERS file
# but we do want to track them in the dashboards
tutorials.to_csv(tutorials_fn, index=False)
# now create codeowners file
# CODEOWNERS requires literal spaces in paths to be escaped as "\ ".
# Use a plain (regex=False) replacement: the old regex form passed "\ " as a
# re.sub replacement template, which is an invalid escape and raises re.error.
refs = found['ref_file'].drop_duplicates().str.replace(" ", "\\ ", regex=False)
with open(os.path.join(script_dir, 'CODEOWNERS.txt'), 'w+') as f:
    for ref in refs:
        f.write(f"/{ref} @sdgilley @msakande @Blackmist @ssalgadodev @lgayhardt @fbsolo-ms1 \n")
# report the branches in use
print(f"References found in {repo_name} {repo_branch}:")
print(branches.to_string(index=False, header=False, justify='left'))
# Record the end time
end_time = datetime.now()
# Calculate the elapsed time and report it in minutes.
# (timedelta / 60 yields another timedelta, not a minute count.)
elapsed_time = end_time - start_time
print(f"\nTime elapsed: {elapsed_time.total_seconds()/60:.2f} minutes")