main.py
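"""Match resumes to job descriptions (JDs) by embedding similarity.

Pipeline: parse JD and resume sections into text fragments, encode the
fragments, then score each (JD, resume) pair as the mean over JD sections of
the maximum cosine similarity between that section and the resume sections
mapped to it. The top 5 resumes per JD are written to a CSV and the full
score dictionary to a JSON file.
"""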
import json
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from config import CONFIG_DICT
from encoders import encode, find_max_similarity
from parsers import parse_section
parsed_resumes = {}
parsed_jds = {}
job_descs = pd.read_csv(CONFIG_DICT['job_description_csv'])
with open(CONFIG_DICT['extracted_resumes']) as f:
    extracted_resumes = json.load(f)
# The mapping dictionary maps each important JD section title to the list of
# resume section titles whose content should be compared against it
with open(CONFIG_DICT['mapping_dict']) as f:
    mapping_dict = json.load(f)
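# The exact section titles depend on the data files; a hypothetical example of
# the expected shapes (illustrative only, not taken from the real files):
#   extracted_resumes: {"resume_1": {"skills": "...", "experience": "..."}}
#   mapping_dict:      {"requirements": ["skills", "experience"]}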
# Parse each job description into sections
print("Parsing JDs...")
for i in tqdm(range(job_descs.shape[0])):
    job_description = json.loads(job_descs['model_response'][i])
    parsed_jds[i] = {}
    for jd_key in job_description:
        if isinstance(job_description[jd_key], list):
            # The section is already a list of items; keep it as-is
            parsed_jds[i][jd_key] = job_description[jd_key]
        else:
            parsed_jds[i][jd_key] = parse_section(job_description[jd_key], jd_key)
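# Note: parse_section (from parsers) is assumed to split a section's free text
# into a list of short fragments suitable for fragment-level encoding; this is
# inferred from how its output is used below, not from its documentation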
# Parse each resume into sections
print("Parsing resumes...")
for resume_i in tqdm(extracted_resumes):
    resume = extracted_resumes[resume_i]
    parsed_resumes[resume_i] = {}
    for r_key in resume:
        parsed_resumes[resume_i][r_key] = parse_section(resume[r_key], r_key)
# Encoding the parsed job descriptions and resumes
print("Encoding JDs...")
encoded_jds = encode(parsed_jds)
print("Encoding resumes...")
encoded_resumes = encode(parsed_resumes)
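# Assumption about encode's output (inferred from the .shape[0] checks and the
# np.concatenate call below): for each document it returns a dict mapping each
# section title to a 2-D numpy array with one embedding row per parsed fragment,
# i.e. encoded_resumes[resume_id][section].shape == (num_fragments, embedding_dim)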
print("Matching resumes to JDs...")
# Calculating similarity between first jd_lim JDs and all resumes
result_dict = {}
# If a portion of the resumes have already been examined and matches stored, we add on to them
if os.path.exists(CONFIG_DICT['result_dict']):
result_dict = json.load(open(CONFIG_DICT['result_dict']))
for jd_key in tqdm(list(encoded_jds.keys())[CONFIG_DICT['jd_range_left']:CONFIG_DICT['jd_range_right']]):
for resume_i in list(encoded_resumes.keys()):
# Initializing sim_arr to store the maximum similarity for each section of the JD
sim_arr = []
for key in list(encoded_jds[jd_key].keys()):
# If the section is not part of our mapping dictionary, or has no text content, we skip it
if len(encoded_jds[jd_key][key]) == 0:
continue
if key not in list(mapping_dict.keys()):
continue
# Obtain the best corresponding resume sections and create a list of all their parsed parts
corr_keys = mapping_dict[key]
resume_encoding_corr = None
for r_key in corr_keys:
if r_key in list(encoded_resumes[resume_i].keys()):
if resume_encoding_corr is None and encoded_resumes[resume_i][r_key].shape[0] != 0:
resume_encoding_corr = encoded_resumes[resume_i][r_key]
else:
if encoded_resumes[resume_i][r_key].shape[0] == 0:
continue
resume_encoding_corr = np.concatenate((resume_encoding_corr, encoded_resumes[resume_i][r_key]))
# If the list is empty, then we score that resume 0 on that particular JD section
if resume_encoding_corr is None or resume_encoding_corr.shape[0] == 0:
sim_arr.append(0)
else:
sim_arr.append(find_max_similarity(encoded_jds[jd_key][key], resume_encoding_corr))
# We store the mean of the cosine similarity scores across all JD sections as our final score
if not jd_key in result_dict:
result_dict[jd_key] = {}
result_dict[jd_key][resume_i] = sum(sim_arr) / len(sim_arr)
else:
result_dict[jd_key][resume_i] = sum(sim_arr) / len(sim_arr)
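# find_max_similarity (from encoders) is assumed to reduce the pairwise cosine
# similarities between the JD-section embeddings and the candidate resume
# embeddings to a single float, presumably via a maximum as the name suggests;
# the exact reduction lives in encoders.py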
# Sort each JD's matches in order of decreasing similarity score
for jd in result_dict:
    result_dict[jd] = dict(sorted(result_dict[jd].items(), key=lambda item: item[1], reverse=True))
# Keep the top 5 matches for each JD
top5_matches = {}
for jd in result_dict:
    top5_matches[jd] = list(result_dict[jd].keys())[:5]
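# At this point top5_matches maps each JD index to its five best resume ids,
# e.g. (with made-up ids): {0: ["r12", "r48", "r3", "r27", "r9"], ...}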
print("Saving results...")
# Save the results of top 5 matches as csv
result_df = pd.DataFrame.from_dict(top5_matches)
result_df = result_df.transpose()
result_df.to_csv(CONFIG_DICT['top5_result'], index=True, header=True)
# Save the complete results as a json for later examination/reuse
json_result = json.dumps(result_dict)
with open(CONFIG_DICT['result_dict'], 'w') as f:
f.write(json_result)
print("Finished!")