-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsers.py
89 lines (69 loc) · 3.77 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import re
from pdfminer.high_level import extract_text
def find_sections(text):
    """Return candidate section headers found in ``text``.

    A candidate is one or two alphabetic words that sit on a single line,
    are preceded by a newline, and are followed by an optional colon and a
    newline (for example ``"\nSkills\n"`` or ``"\nWork Experience:\n"``).

    Args:
        text: Text to scan (str).

    Returns:
        A list of the matched headers, in order of appearance, without the
        leading newline and without the trailing colon.

    Note:
        A header must be preceded by a newline, so a header on the very
        first line of ``text`` is not reported. The pattern needs at least
        two letters, so a single-letter line never matches.
    """
    # The optional separator between the two words is any whitespace EXCEPT a
    # newline: with a plain `\s?` two consecutive one-word lines such as
    # "Skills\nPython" were merged into a single bogus header.
    # The trailing `:?\n` is a lookahead so the newline is not consumed and
    # can start the next match.
    pattern = r'\n([A-Za-z]+[^\S\n]?[A-Za-z]+(?=:?\n))'
    return [match.group(1) for match in re.finditer(pattern, text)]
def extract_sections(text, resume_headers):
    """Split ``text`` into sections keyed by the headers found in it.

    Each header in ``resume_headers`` is looked up in ``text`` in one of
    these forms, tried in this order:

    1. ``"\n" + header + "\n"``
    2. ``"\n" + header + ":"`` (when a newline directly follows the colon,
       that newline is treated as part of the header delimiter)

    Only the first occurrence of a header is used. A section runs from the
    end of its header delimiter up to the start of the next located header
    (in order of position in ``text``), or to the end of ``text`` for the
    last header.

    Args:
        text: Text to split (str).
        resume_headers: Iterable of header strings to look for.

    Returns:
        A dict mapping each located header to its section text, with keys
        inserted in the order the headers appear in ``text``. Headers that
        are not found are omitted.
    """
    located = {}
    for header in resume_headers:
        bare = '\n' + header + '\n'
        with_colon = '\n' + header + ':'

        index = text.find(bare)
        if index != -1:
            located[header] = {'length': len(bare), 'index': index}
            continue

        index = text.find(with_colon)
        if index != -1:
            # "\nHeader:" is a prefix of "\nHeader:\n", so a separate search
            # for the longer form could never be reached after this one.
            # Instead, extend the delimiter over the newline when present so
            # it does not leak into the section text.
            length = len(with_colon)
            if text.startswith(with_colon + '\n', index):
                length += 1
            located[header] = {'length': length, 'index': index}

    # Order headers by where they occur so that each section ends where the
    # next header begins.
    ordered = sorted(located.items(), key=lambda item: item[1]['index'])

    sections_dict = {}
    for position, (header, info) in enumerate(ordered):
        start = info['index'] + info['length']
        if position + 1 < len(ordered):
            end = ordered[position + 1][1]['index']
        else:
            end = len(text)
        sections_dict[header] = text[start:end]
    return sections_dict
def parse_section(section_text, section_key):
    """Split a section's text into a list of cleaned items.

    The delimiter is chosen from the punctuation present in the text:

    * no periods, commas or semicolons: one item per line;
    * no periods but commas/semicolons, or ``'Skill'`` occurs in
      ``section_key``: split on commas and semicolons;
    * otherwise (periods present): split on periods.

    Args:
        section_text: Raw text of the section (str).
        section_key: Header the section was found under; sections whose key
            contains ``'Skill'`` are split on commas/semicolons even when
            they contain periods.

    Returns:
        A list of non-empty items, each stripped of surrounding whitespace
        and with every run of internal newlines replaced by a single space.
    """
    has_period = '.' in section_text
    has_list_delimiter = ',' in section_text or ';' in section_text

    if not has_period and not has_list_delimiter:
        # Semicolons must be ruled out here as well as commas; otherwise a
        # semicolon-only list takes this branch and is never split on ';'.
        section_items = section_text.split('\n')
    elif not has_period or 'Skill' in section_key:
        # Reaching this branch without a period implies a comma or
        # semicolon is present.
        section_items = re.split(r'[,;]', section_text)
    else:
        section_items = section_text.split('.')

    # strip() already removes newlines and tabs, so one emptiness check is
    # enough to drop blank items.
    stripped_items = (item.strip() for item in section_items)
    return [re.sub('\n+', ' ', item) for item in stripped_items if item]