utility.py
"""
Utility functions.
"""
import os
import re
import zipfile
import subprocess
import psutil
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from pyvis.network import Network
import networkx as nx


def kill(proc_pid):
    """
    Kill the process with the given PID, along with all of its children.
    """
    process = psutil.Process(proc_pid)
    # Kill the children first so none of them are orphaned and left running.
    for proc in process.children(recursive=True):
        proc.kill()
    process.kill()


def execute_system_call(command, max_wait=30):
    """
    Execute a system call and return a summary of its output.

    Collects every PASS/FAIL token printed to stdout; anything on stderr is
    appended as an {{Implementation Error}} tag. The process tree is killed
    if it runs longer than max_wait seconds.
    """
    with subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          text=True,
                          shell=False) as process:
        try:
            std_out, std_err = process.communicate(timeout=max_wait)
            output = ''.join(re.findall('PASS|FAIL', std_out.strip()))
            if std_err:
                output += ' {{Implementation Error}}@[' + \
                    std_err.strip().replace('\n', '').replace(',', '') + ']'
            return output
        except subprocess.TimeoutExpired:
            # communicate() leaves the child running on timeout, so kill the
            # whole process tree before reporting the failure.
            kill(process.pid)
            return ' {{TimeOut Error}} '
        except subprocess.SubprocessError:
            return ' {{RunTime Error}} '
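
# A minimal usage sketch (hypothetical command and paths, not from the
# repo): grading a student's test script might look like
#     execute_system_call(['python', 'submissions/alice/hw1_test.py'])
# returning e.g. 'PASSPASSFAIL', with any stderr text appended as an
# {{Implementation Error}} tag.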


def find_emails(text):
    """
    Find all email addresses in the text using regular expressions.
    """
    # Pattern for a typical user@domain.tld address.
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.findall(email_pattern, text)
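
# A minimal usage sketch (hypothetical sample text, not from the repo):
#     find_emails("Reach alice@example.com or bob.smith@uni.edu")
# returns ['alice@example.com', 'bob.smith@uni.edu'].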


def extract_link(link_file_path):
    """
    Extract the first hyperlink from an HTML file and normalize it to a
    repository root URL.
    """
    with open(link_file_path, 'r', encoding='utf-8') as link_file:
        index = link_file.read()
    link = BeautifulSoup(index, 'lxml').body.a['href']
    # Strip a trailing '.git', or truncate at 'blob'/'tree' so deep links
    # collapse to the repository root.
    if link.endswith('.git'):
        return link[:-4]
    if link.find('blob') != -1:
        return link[:link.find('blob')]
    if link.find('tree') != -1:
        return link[:link.find('tree')]
    return link
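
# A minimal usage sketch (hypothetical file contents): if the first link in
# the HTML file is 'https://github.com/user/repo/blob/main/hw1.py', the
# function returns 'https://github.com/user/repo/'.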


def unzip(file, file_dir, skip_dir=True):
    """
    Unzip the submission file into file_dir.

    With skip_dir=True the archive's directory structure is discarded and
    every file is extracted flat into file_dir.
    """
    if not skip_dir:
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(file_dir)
    else:
        with zipfile.ZipFile(file, 'r') as zip_ref:
            for zip_info in zip_ref.infolist():
                if zip_info.is_dir():
                    continue
                # Drop the directory prefix; files sharing a basename in
                # different folders will overwrite each other.
                zip_info.filename = os.path.basename(zip_info.filename)
                zip_ref.extract(zip_info, file_dir)
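
# A minimal usage sketch (hypothetical paths): flatten a submission archive
# so nested folders do not hide the homework files.
#     unzip('downloads/alice_hw1.zip', 'submissions/alice')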


def remove_duplicates(csv_file):
    """
    Remove duplicate IDs in the CSV file with pandas, keeping the row with
    the maximum 'Score' value for each 'ID'.
    """
    data_frame = pd.read_csv(csv_file)
    # Sort by score so drop_duplicates keeps the highest-scoring row per ID,
    # then restore ID order.
    data_frame = data_frame.sort_values('Score', ascending=False)\
        .drop_duplicates('ID').sort_values('ID')
    # Write the de-duplicated data back to the same CSV file.
    data_frame.to_csv(csv_file, index=False)
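
# A minimal sketch of the effect (hypothetical data): a CSV containing
#     ID,Score
#     42,80
#     42,95
#     57,70
# is rewritten so ID 42 keeps only its best score:
#     ID,Score
#     42,95
#     57,70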


def detect_similarity(submission_dir, hw_str, threshold):
    """
    Check the similarity between submission files under a directory and
    render one interactive similarity graph per language (MATLAB/Python).
    """
    matlab_documents = []
    matlab_users = []
    python_documents = []
    python_users = []
    # Collect hw_str.m / hw_str.py from each student's directory.
    for student_dir in os.listdir(submission_dir):
        if os.path.isdir(os.path.join(submission_dir, student_dir)):
            if os.path.exists(os.path.join(submission_dir, student_dir, hw_str + '.m')):
                with open(os.path.join(submission_dir, student_dir, hw_str + '.m'),
                          'r', encoding='utf-8') as file:
                    matlab_documents.append(file.read())
                matlab_users.append(student_dir)
            if os.path.exists(os.path.join(submission_dir, student_dir, hw_str + '.py')):
                with open(os.path.join(submission_dir, student_dir, hw_str + '.py'),
                          'r', encoding='utf-8') as file:
                    python_documents.append(file.read())
                python_users.append(student_dir)
    network = Network('1200px', '1200px')
    g_matlab = check_similarity(matlab_documents, matlab_users, threshold)
    # nx.draw renders into the current matplotlib figure; the pyvis Network
    # writes the interactive HTML view.
    nx.draw(g_matlab, with_labels=True)
    network.from_nx(g_matlab)
    network.show('matlab.html', notebook=False)
    network = Network('1200px', '1200px')
    g_python = check_similarity(python_documents, python_users, threshold)
    nx.draw(g_python, with_labels=True)
    network.from_nx(g_python)
    network.show('python.html', notebook=False)
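
# A minimal usage sketch (hypothetical layout): with one folder per student
# under 'submissions/', each containing 'hw3.m' or 'hw3.py',
#     detect_similarity('submissions', 'hw3', 0.9)
# writes interactive 'matlab.html' and 'python.html' similarity graphs.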


def check_similarity(documents, users, threshold):
    """
    Check the pairwise TF-IDF cosine similarity between documents and return
    a graph connecting users whose similarity exceeds the threshold.
    """
    graph_similarity = nx.Graph()
    if not documents:
        # TfidfVectorizer raises on an empty corpus; return an empty graph.
        return graph_similarity
    tfidf = TfidfVectorizer().fit_transform(documents)
    # Rows are L2-normalized, so this product is the cosine similarity matrix.
    pairwise_similarity = tfidf * tfidf.T
    for i, user_i in enumerate(users):
        for j, user_j in enumerate(users):
            # i > j restricts to the lower triangle: each pair once, no self-loops.
            if i > j and pairwise_similarity[i, j] > threshold:
                graph_similarity.add_edge(user_i.split('_')[0],
                                          user_j.split('_')[0],
                                          title=str(pairwise_similarity[i, j]))
    return graph_similarity
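

if __name__ == '__main__':
    # A minimal self-contained demo (hypothetical toy data, not part of the
    # grading pipeline): the two identical "submissions" should end up
    # connected in the similarity graph; the unrelated one should not.
    demo_graph = check_similarity(
        ["total = sum(range(10))",
         "total = sum(range(10))",
         "print('hello world')"],
        ["alice_001", "bob_002", "carol_003"],
        threshold=0.9,
    )
    print(list(demo_graph.edges(data=True)))  # expect: alice -- bob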