import os
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from nltk.corpus import wordnet
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

from aparts.src.deduplication import count_tag_occurrence
from aparts.src.subsampling import (assign_group, generate_binary_item_matrix,
                                    transform_dataframe)

def pca_biplot(X: np.ndarray, y, targets: list, features: list):
    """Draw a 2-component PCA biplot: scores per class plus the top-k feature loadings."""
    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2).fit(X_scaled)
    X_reduced = pca.transform(X_scaled)
    scores = X_reduced[:, :2]
    loadings = pca.components_[:2].T
    pvars = pca.explained_variance_ratio_[:2] * 100
    # Keep the k features whose loadings, weighted by explained variance,
    # contribute most to the first two components.
    k = 10
    tops = (loadings * pvars).sum(axis=1).argsort()[-k:]
    arrows = loadings[tops]
    arrows /= np.sqrt((arrows ** 2).sum(axis=0))
    plt.figure(figsize=(5, 5))
    for name in targets:
        # Select the score rows belonging to this class by label, not by position.
        plt.scatter(*zip(*scores[y == name]), label=name, s=8, alpha=0.5)
    plt.legend(title='Class')
    width = -0.005 * np.min([np.subtract(*plt.xlim()),
                             np.subtract(*plt.ylim())])
    for i, arrow in zip(tops, arrows):
        plt.arrow(0, 0, *arrow, color='k', alpha=0.75, width=width, ec='none',
                  length_includes_head=True)
        plt.text(*(arrow * 1.15), features[i], ha='center', va='center')
    for i, axis in enumerate('xy'):
        getattr(plt, f'{axis}ticks')([])
        getattr(plt, f'{axis}label')(f'PC{i + 1} ({pvars[i]:.2f}%)')
    plt.show()

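# A minimal smoke test for pca_biplot (hypothetical, kept commented out so
# importing this module stays side-effect free); assumes scikit-learn's
# bundled iris dataset:
#
#     from sklearn.datasets import load_iris
#     iris = load_iris()
#     y_names = np.asarray([iris.target_names[t] for t in iris.target])
#     pca_biplot(iris.data, y_names, list(iris.target_names),
#                list(iris.feature_names))
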
def pca_tags(CSV: str, variables: str, id: str, tag_length: int, number_of_records: int):
    """Build a binary tag matrix from a CSV and show its PCA biplot."""
    matrix, id_list = generate_binary_item_matrix(
        CSV, variables, id, tag_length, number_of_records)
    counts = count_tag_occurrence(matrix)
    column = [item[0] for item in counts]
    groups = assign_group(matrix, column)
    group_list = set(groups)
    X, y, targets, features = transform_dataframe(matrix, groups, group_list)
    y = np.asarray(y)
    pca_biplot(X, y, targets, features)

def merge_words_by_stem_and_wildcards(text: str, min_similarity=0.6, min_prefix_length=3) -> list:
    """
    Take an input text of comma-separated words and merge similar words into a
    shared stem with a trailing wildcard, based on a similarity threshold.

    Parameters:
    -----------
    text (str): The input text containing comma-separated words.
    min_similarity (float, optional): The minimum similarity threshold for merging words (default is 0.6).
    min_prefix_length (int, optional): The minimum shared prefix length for two words to count as similar (default is 3).

    Returns:
    -----------
    merged_words (list of str): Deduplicated words in which similar words are merged by stem with wildcards.
    """
    def calculate_similarity(word1, word2, min_prefix_length=min_prefix_length):
        # Similarity is the shared prefix length relative to the longer word.
        common_prefix = os.path.commonprefix([word1, word2])
        if len(common_prefix) < min_prefix_length:
            return 0.0
        return len(common_prefix) / max(len(word1), len(word2))

    # Tokenize the input text into words
    words = [word.strip() for word in text.split(",")]
    merged_words = []
    for word1 in words:
        if word1 not in merged_words:
            similar_words = [word1]
            for word2 in words:
                if word2 != word1 and word2 not in similar_words:
                    if calculate_similarity(word1, word2) >= min_similarity:
                        similar_words.append(word2)
            if len(similar_words) > 1:
                # Trim the first word down to the prefix shared by every similar word.
                merged_word = similar_words[0]
                for i in range(1, len(similar_words)):
                    j = 0
                    while j < len(merged_word) and j < len(similar_words[i]) and merged_word[j] == similar_words[i][j]:
                        j += 1
                    # If the merge point falls right after a space, a wildcard
                    # would be meaningless: keep the similar words separately.
                    if j > 0 and merged_word[j - 1] == ' ':
                        merged_words.extend(similar_words)
                        break
                    else:
                        merged_word = merged_word[:j] + '*'
                else:
                    merged_words.append(merged_word)
            else:
                merged_words.append(word1)
    # Deduplicate and return a list, matching the documented return type.
    merged_words = list(set(merged_words))
    return merged_words

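# Hypothetical usage (commented out so importing this module has no side
# effects). Words that share a long-enough stem collapse into a wildcard term:
#
#     merge_words_by_stem_and_wildcards("ecology, ecological, ecosystem, liana")
#     # -> ['ecolog*', 'ecosystem', 'liana'] (set-deduplicated, order may vary)
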
def group_related_words(word_list: list[str]):
    """
    Group words based on WordNet synsets without expanding the list with additional synonyms.

    Parameters:
    -----------
    word_list (list): A list of words to be grouped.

    Returns:
    -----------
    grouped_words (list): The input words reordered so that synset-related words sit next to each other.
    """
    # Collect the lemma names of every synset each word belongs to.
    synsets = {}
    for word in word_list:
        synsets[word] = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synsets[word].add(lemma.name())
    grouped_words = []
    processed = set()
    for word in word_list:
        if word not in processed:
            related = synsets[word]
            group = [w for w in word_list if w in related and w != word]
            group.append(word)
            grouped_words.extend(group)
            processed.update(group)
    return grouped_words

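# Hypothetical usage (commented out): 'car' and 'automobile' share a WordNet
# synset, so they end up adjacent in the output:
#
#     group_related_words(['car', 'liana', 'automobile'])
#     # -> ['automobile', 'car', 'liana']
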
def group_fuzzy_words(word_list: list[str], threshold: int = 80):
    """
    Group words based on fuzzy matches.

    Parameters:
    -----------
    word_list (list): A list of words to be grouped.
    threshold (int): Minimum fuzz ratio (0-100) for two words to count as a match (default is 80).

    Returns:
    -----------
    grouped_words (list): The input words reordered so that fuzzily similar words sit next to each other.
    """
    grouped_words = []
    processed = set()
    for word in word_list:
        if word not in processed:
            group = [w for w in word_list if fuzz.ratio(
                word, w) >= threshold and w != word]
            group.append(word)
            grouped_words.extend(group)
            processed.update(group)
    return grouped_words

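# Hypothetical usage (commented out): near-duplicates such as singular/plural
# forms score above the default ratio of 80 and are grouped together:
#
#     group_fuzzy_words(['liana', 'lianas', 'forest'])
#     # -> ['lianas', 'liana', 'forest']
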
def expand_query_with_tag_similarity(query, tags, threshold=0.2, top_k=1):
    """
    Expand a query based on tag similarity.

    Parameters:
    -----------
    query (str): The original query to be expanded.
    tags (list or set): A list or set of tags used for similarity comparison.
    threshold (float, optional): The similarity threshold to consider for expansion (default is 0.2).
    top_k (int, optional): The maximum number of terms to add to the query (default is 1).

    Returns:
    -----------
    expanded_terms (list): Up to top_k tags whose TF-IDF cosine similarity to the query exceeds the threshold.
    """
    if isinstance(tags, set):
        tags = list(tags)
    # Vectorize the tags, then project the query into the same TF-IDF space.
    vectorizer = TfidfVectorizer()
    tag_matrix = vectorizer.fit_transform(tags)
    query_vec = vectorizer.transform([query]).toarray()
    tag_similarity = cosine_similarity(query_vec, tag_matrix)
    if tag_similarity.shape[0] == 0:
        return [query]
    tag_similarity = tag_similarity[0]
    # Rank tags by similarity and keep the top_k above the threshold.
    sorted_indices = np.argsort(tag_similarity)[::-1]
    expanded_terms = [tags[i] for i in sorted_indices
                      if tag_similarity[i] > threshold][:top_k]
    return expanded_terms

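# Hypothetical usage (commented out): with a small tag vocabulary, the tag
# sharing the most TF-IDF weight with the query is returned first:
#
#     expand_query_with_tag_similarity("mosquito population dynamics",
#                                      ["population ecology", "remote sensing"],
#                                      threshold=0.1, top_k=1)
#     # -> ['population ecology']
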
def group_synonyms(words, fuzzy_threshold: int = 0):
    """
    Pair words that share a WordNet synonym (or, if fuzzy_threshold is set, a
    fuzzy match) into '(word1 OR word2)' groups; all other words pass through.
    """
    grouped_list = []
    words_without_synonyms = []
    paired = set()

    def get_synsets(word):
        # Multi-word phrases are skipped; WordNet lookups work per single word.
        return [synset for synset in wordnet.synsets(word) if ' ' not in word]

    for i, word1 in enumerate(words):
        synsets1 = get_synsets(word1)
        if not synsets1:
            words_without_synonyms.append(word1)
            continue
        for j in range(i + 1, len(words)):
            word2 = words[j]
            synsets2 = get_synsets(word2)
            if not synsets2:
                words_without_synonyms.append(word2)
                continue
            # Two words count as synonyms when any of their synsets share a lemma.
            common_synonyms = set()
            for synset1 in synsets1:
                for synset2 in synsets2:
                    common_synonyms.update(
                        set(synset1.lemma_names()).intersection(synset2.lemma_names()))
            if common_synonyms:
                grouped_list.append(f"({word1} OR {word2})")
                paired.update((word1, word2))
            elif fuzzy_threshold:  # Check for fuzzy matches if a threshold is supplied
                if fuzz.token_sort_ratio(word1, word2) >= fuzzy_threshold:
                    grouped_list.append(f"({word1} OR {word2})")
                    paired.update((word1, word2))
    # Words with synsets that never found a partner would otherwise be lost
    # silently; keep them alongside the words without synsets.
    unpaired = [w for w in words
                if w not in paired and w not in words_without_synonyms]
    words_without_synonyms = list(set(words_without_synonyms + unpaired))
    return grouped_list + words_without_synonyms

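# Hypothetical usage (commented out): 'car' and 'automobile' share a WordNet
# lemma, so they collapse into one OR-group:
#
#     group_synonyms(['car', 'automobile', 'liana'])
#     # -> ['(car OR automobile)', 'liana']
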
def similarity_feedback(input: str, original_query: str = "Culex pipiens AND population dynamics", keyword_column: str = 'keywords', threshold: float = 0.2, top_k: int = 3) -> list[str]:
    """
    Expand a query based on tag similarity in the source csv.

    Parameters:
    -----------
    input (str): Path to the CSV file whose keyword column supplies candidate tags.
    original_query (str): The original query to be expanded.
    keyword_column (str): Name of the column containing the keywords/tags.
    threshold (float, optional): The similarity threshold to consider for expansion (default is 0.2).
    top_k (int, optional): The maximum number of terms to add to the query (default is 3).

    Returns:
    -----------
    expanded_query (list): The tags most similar to the query, as returned by expand_query_with_tag_similarity.
    """
    df = pd.read_csv(input)
    df = df.dropna(subset=[keyword_column])
    tags = set(df[keyword_column].tolist())
    expanded_query = expand_query_with_tag_similarity(
        original_query, tags, threshold, top_k)
    return expanded_query

def pseudo_relevance_feedback(input: str, original_query: str, trainingset: tuple[int, int] = (50, 20), n_tags: int = 10, n_articles: int = 20, print_weights: bool = False) -> tuple[list[str], pd.DataFrame]:
    """
    Perform pseudo-relevance feedback on a collection of documents using the Rocchio algorithm on TF-IDF weights.

    Parameters:
    -----------
    input (str): Path to the CSV file containing the records to scan; its 'Keywords' column is vectorized.
    original_query (str): The original query string for retrieval.
    trainingset (tuple): Percentages of documents to use as the "good" and "bad" training sets (ingroup and outgroup). The ingroup is taken from the top, the outgroup from the bottom.
    n_tags (int): Number of top relevant tags to keep for query expansion.
    n_articles (int): Number of top relevant articles to return.
    print_weights (bool): Whether to print the top tags with their TF-IDF weights.

    Returns:
    -----------
    top_relevant_tags (list): The n_tags highest-weighted tags after feedback.
    top_articles (DataFrame): The n_articles documents most similar to the updated query.
    """
    df = pd.read_csv(input)
    ingroup, outgroup = trainingset
    length = df.shape[0]
    ingroup_records = int((ingroup / 100) * length)
    outgroup_records = int((outgroup / 100) * length)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['Keywords'])
    # Centroids of the presumed-relevant head and presumed-irrelevant tail,
    # sliced by record count (not by the raw percentage values).
    centroid_good = np.mean(tfidf_matrix[:ingroup_records], axis=0)
    centroid_bad = np.mean(tfidf_matrix[-outgroup_records:], axis=0)
    # Rocchio update: q' = alpha*q + beta*centroid(relevant) - gamma*centroid(non-relevant)
    alpha = 1.0
    beta = 0.75
    gamma = 0.25
    expanded_query_vector = tfidf_vectorizer.transform([original_query])
    updated_query = alpha * expanded_query_vector + \
        beta * centroid_good - gamma * centroid_bad
    updated_query_array = np.asarray(updated_query)
    # Pair each vocabulary term with its weight in the updated query vector;
    # indexing by the nonzero positions keeps terms and weights aligned.
    feature_names = tfidf_vectorizer.get_feature_names_out()
    weights = updated_query_array[0]
    relevant_tags = [(feature_names[i], weights[i])
                     for i in np.nonzero(weights)[0]]
    relevant_tags.sort(key=lambda x: x[1], reverse=True)
    top_relevant_tags = [tag for tag, _ in relevant_tags][:n_tags]
    if print_weights:
        print("Relevant Tags with TF-IDF Weights:")
        for tag, weight in relevant_tags[:n_tags]:
            print(f"{tag}: {weight}")
    cosine_similarities = cosine_similarity(updated_query_array, tfidf_matrix)
    df['cosine_similarity'] = cosine_similarities[0]
    df = df.sort_values(by='cosine_similarity', ascending=False)
    top_articles = df.head(n_articles)
    return top_relevant_tags, top_articles

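# Hypothetical usage (commented out; the path is an assumption, and the CSV
# must contain a 'Keywords' column):
#
#     tags, articles = pseudo_relevance_feedback(
#         input="records_sorted.csv",
#         original_query="liana* AND forest dynamics",
#         trainingset=(30, 20), n_tags=10, n_articles=5)
#     # 'tags' holds candidate expansion terms; 'articles' the best-ranked rows.
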
def _read_table(path: str) -> pd.DataFrame:
    """Read a CSV or Excel file into a DataFrame based on its extension."""
    extension = path.split('.')[-1].lower()
    if extension == 'csv':
        return pd.read_csv(path)
    if extension in ['xls', 'xlsx']:
        return pd.read_excel(path)
    raise ValueError(
        f"Unsupported file format for {path}. Use CSV or Excel files.")


def count_title_matches(file1_path, file2_path, file1_column, file2_column, show_missing: bool = False):
    """
    Count the number of matching titles between two files.

    Parameters:
    -----------
    file1_path (str): Path to the first file.
    file2_path (str): Path to the second file.
    file1_column (str): Column containing titles in the first file.
    file2_column (str): Column containing titles in the second file.
    show_missing (bool, optional): Whether to display titles that are in the first file but not in the second file (default is False).

    Returns:
    -----------
    match_count (int): Number of titles from the first file found in the second.
    total_titles2 (int): Total number of titles in the second file.
    """
    df1 = _read_table(file1_path)
    df2 = _read_table(file2_path)
    titles1 = df1[file1_column].astype(str)
    titles2 = df2[file2_column].astype(str)
    match_count = titles1.isin(titles2).sum()
    if show_missing:
        for item in titles1[~titles1.isin(titles2)]:
            print(item)
    total_titles1 = len(titles1)
    total_titles2 = len(titles2)
    print(f"{match_count} out of {total_titles1} matches.")
    return match_count, total_titles2

def count_title_matches_from_list(file1_path: str, selected_list: list, file1_column: str, show_missing: bool = False, show_score: bool = False):
    """
    Count the number of matching titles between a file and a selected list of titles.

    Parameters:
    -----------
    file1_path (str): Path to the file.
    selected_list (list): List of titles for comparison.
    file1_column (str): Column containing titles in the file.
    show_missing (bool, optional): Whether to display titles that are in the file but not in the selected list (default is False).
    show_score (bool, optional): Whether to print the match count (default is False).

    Returns:
    -----------
    match_count (int): Number of titles from the file found in the selected list.
    """
    df1 = _read_table(file1_path)
    titles1 = df1[file1_column].astype(str)
    match_count = titles1.isin(selected_list).sum()
    if show_missing:
        for item in titles1[~titles1.isin(selected_list)]:
            print(item)
    if show_score:
        print(f"{match_count} out of {len(titles1)} matches.")
    return match_count

def emulate_query(query: str, file_path: str, title_column: str, abstract_column: str) -> list:
    """
    Emulate a boolean query on a dataframe based on title and abstract columns.

    Parameters:
    -----------
    query (str): Query string to emulate.
    file_path (str): Path to the file containing data.
    title_column (str): Column containing titles in the file.
    abstract_column (str): Column containing abstracts in the file.

    Returns:
    -----------
    filtered_titles (list): List of titles matching the query.
    """
    df = pd.read_excel(file_path)
    # Translate the boolean query into a single regex: parentheses and
    # wildcards are stripped, 'OR' becomes alternation, and every AND-clause
    # becomes its own lookahead so all clauses must match somewhere.
    clauses = (query.replace("(", "").replace(")", "")
                    .replace(" OR ", "|").replace("*", "").split(" AND "))
    regex_pattern = '^' + ''.join(f'(?=.*(?:{clause}))' for clause in clauses)
    title_matches = df[title_column].str.contains(
        regex_pattern, regex=True, case=False, na=False)
    abstract_matches = df[abstract_column].str.contains(
        regex_pattern, regex=True, case=False, na=False)
    filtered_df = df[title_matches | abstract_matches]
    filtered_titles = filtered_df[title_column].tolist()
    return filtered_titles

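# Hypothetical illustration (commented out) of the query-to-regex translation:
#
#     "forest* AND (liana* OR vine*)"
#     # becomes the pattern: ^(?=.*(?:forest))(?=.*(?:liana|vine))
#     # i.e. a row matches when every AND-clause occurs somewhere in the text.
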
def test_query(query: str, target_file: str, source_file: str, target_title_column: str, source_title_column: str, source_abstract_column: str) -> tuple:
    """
    Test overlap for a given query in a source file against a target file.

    Parameters:
    -----------
    query (str): The query string to be tested.
    target_file (str): Path to the target file containing the records that should be matched.
    source_file (str): Path to the source file containing the records to be searched.
    target_title_column (str): Column containing titles in the target file.
    source_title_column (str): Column containing titles in the source file.
    source_abstract_column (str): Column containing abstracts in the source file.

    Returns:
    -----------
    score (int): Number of target titles matched by the query.
    match_total (int): Total number of titles the query matched in the source file.
    filtered_titles (list): List of titles matching the query in the source file.

    Example:
    -----------
    test_query(query="forest* AND (tropic* OR neotropic*) AND (climber* OR liana* OR vine*) AND (trend* OR change*) AND (ground OR climate OR hyperspectral OR accretion OR precipitation OR reproduction OR environmental)", target_file='D:/Users/Sam/Downloads/lianas_oct24.csv', source_file='D:/Users/Sam/Downloads/lianas_original.xls', target_title_column='title', source_title_column='Article Title', source_abstract_column='Abstract')
    """
    filtered_titles = emulate_query(
        query, source_file, source_title_column, source_abstract_column)
    score = count_title_matches_from_list(
        target_file, filtered_titles, target_title_column)
    return score, len(filtered_titles), filtered_titles

def verify_query(query: str, target_file: str, source_file: str, test_file: str, target_title_column: str, test_title_column: str, source_title_column: str, source_abstract_column: str) -> list:
    """
    Verify the behavior of a given query against a target file and a test file.

    Parameters:
    -----------
    query (str): The query string to be verified.
    target_file (str): Path to the target file.
    source_file (str): Path to the source file.
    test_file (str): Path to the test file acquired from a search engine using the same query.
    target_title_column (str): Column containing titles in the target file.
    test_title_column (str): Column containing titles in the test file.
    source_title_column (str): Column containing titles in the source file.
    source_abstract_column (str): Column containing abstracts in the source file.

    Returns:
    -----------
    filtered_titles (list): List of titles matching the query in the source file.

    Example:
    -----------
    verify_query(query="forest* AND (tropic* OR neotropic*) AND (climber* OR liana* OR vine*) AND (trend* OR change*) AND (ground OR climate OR hyperspectral OR accretion OR precipitation OR reproduction OR environmental)", target_file='D:/Users/Sam/Downloads/lianas_oct24.csv', source_file='D:/Users/Sam/Downloads/lianas_original.xls', test_file='D:/Users/Sam/Downloads/savedrecs(6).xls', target_title_column='title', test_title_column='Article Title', source_title_column='Article Title', source_abstract_column='Abstract')
    """
    filtered_titles = emulate_query(
        query, source_file, source_title_column, source_abstract_column)
    count_title_matches_from_list(
        target_file, filtered_titles, target_title_column)
    count_title_matches(target_file, test_file,
                        target_title_column, test_title_column)
    return filtered_titles

def generate_query_combinations(items: list) -> list[str]:
    """Generate all OR-groups (non-empty subsets) of a list, plus single AND extensions."""
    # Enumerate subsets directly instead of product(items, repeat=len(items)),
    # which yields the same set of OR-groups at a fraction of the cost.
    or_groups = []
    for r in range(2, len(items) + 1):
        for subset in combinations(items, r):
            or_groups.append(f"({' OR '.join(subset)})")
    combinations_list = list(set(or_groups + list(items)))
    and_combinations = [
        f'{combination} AND {item}' for item in items for combination in combinations_list if item not in combination]
    combinations_list.extend(set(and_combinations))
    return list(set(combinations_list))

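# Hypothetical illustration (commented out) for a two-item list (order may vary):
#
#     generate_query_combinations(['liana', 'vine'])
#     # -> ['(liana OR vine)', 'liana', 'vine',
#     #     'vine AND liana', 'liana AND vine']
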
def generate_queries(query: str, items: list):
    """Combine a base query with every combination generated from items."""
    query_list = []
    combinations_list = generate_query_combinations(items)
    for item in combinations_list:
        if query != "":
            updated_query = query + ' AND ' + item
        else:
            updated_query = item
        query_list.append(updated_query)
    return query_list

def find_optimal_query(query_list, target_file: str, source_file: str, target_title_column: str, source_title_column: str, source_abstract_column: str, max_matches: int) -> tuple:
    """Test every candidate query and keep the best-scoring one among those returning fewer than max_matches records."""
    highest_score = float('-inf')
    lowest_matches = float('inf')
    best_query = ""
    for query in query_list:
        score, matches, _ = test_query(
            query, target_file, source_file, target_title_column, source_title_column, source_abstract_column)
        if matches < max_matches:
            # Accept near-ties (within 2 points) so later, more specific
            # queries can replace an earlier one with a similar score.
            if score >= (highest_score - 2):
                highest_score = score
                lowest_matches = matches
                best_query = query
    return best_query, highest_score, lowest_matches

def select_relevant_tags(source_file: str, tag_column: str, record_amount: int) -> list:
    """Collect the individual tags from the first record_amount rows of a CSV."""
    df = pd.read_csv(source_file)
    tags = df.head(record_amount)[tag_column]
    # Flatten the comma-separated tag strings into one list of single tags.
    tags_list = ', '.join(list(tags)).split(', ')
    return tags_list

def auto_optimize_query(query: str, extra_tags: list, source_file: str, tag_file: str, tag_column: str, record_amount: int, target_file: str, target_title_column: str, source_title_column: str, source_abstract_column: str, max_matches: int):
    """Greedily extend a base query with the tag combinations that keep target recall high while reducing the number of matched records."""
    relevant_tags = []
    if record_amount != "":
        relevant_tags = select_relevant_tags(tag_file, tag_column, record_amount)
    if extra_tags != "":
        relevant_tags = relevant_tags + extra_tags
    # Drop tags that overlap with terms already present in the base query.
    non_relevant_tags = ['forest', 'tropic',
                         'climber', 'liana', 'vine', 'trend', 'change']
    for word in non_relevant_tags:
        relevant_tags = [item for item in relevant_tags if word not in item]
    query_list = relevant_tags.copy()
    best_query, highest_score, lowest_matches = find_optimal_query(
        query_list=query_list,
        target_file=target_file,
        source_file=source_file,
        target_title_column=target_title_column,
        source_title_column=source_title_column,
        source_abstract_column=source_abstract_column,
        max_matches=max_matches
    )
    print(
        f"The current query is: {best_query} with a score of {highest_score}/{lowest_matches}")
    while True:
        # Remove the tag just absorbed into the query, then try to extend the
        # best query with each remaining tag.
        selected_word = best_query.replace(relevant_tags[0], "")
        relevant_tags = [
            item for item in relevant_tags if item != selected_word]
        new_queries = []
        for item in relevant_tags:
            word_list = [best_query, item]
            generated_queries = generate_queries("", word_list)
            new_queries.extend(generated_queries)
        query_list = list(set(new_queries))
        better_query, higher_score, lower_matches = find_optimal_query(
            query_list=query_list,
            target_file=target_file,
            source_file=source_file,
            target_title_column=target_title_column,
            source_title_column=source_title_column,
            source_abstract_column=source_abstract_column,
            max_matches=lowest_matches
        )
        # Flatten redundant parentheses introduced by recombining queries.
        better_query = better_query.replace(') OR', ' OR').replace(
            'OR (', 'OR ').replace('))', ')').replace('((', '(').replace('  ', ' ')
        print(
            f"The current query is: {better_query} with a score of {higher_score}/{lower_matches}")
        # Stop once an iteration no longer improves both score and specificity.
        if higher_score < highest_score or lower_matches >= lowest_matches:
            break
        best_query = query + " AND " + better_query
        highest_score = higher_score
        lowest_matches = lower_matches
    print(
        f"The best query is: {best_query} with a score of {highest_score}/{lowest_matches}")
    return best_query, highest_score, lowest_matches

def propose_tags(original_query: str, input_file: str, keyword_column: str, trainingset: tuple[int, int], n_tags: int, threshold: float):
    """Combine pseudo-relevance and similarity feedback, then merge the resulting terms by stem, fuzzy match, and synonym."""
    expanded_query1, top_articles = pseudo_relevance_feedback(
        input=input_file, original_query=original_query, trainingset=trainingset, n_tags=n_tags)
    expanded_query2 = similarity_feedback(
        input=input_file, original_query=original_query, keyword_column=keyword_column, threshold=threshold)
    expanded_query = ', '.join(expanded_query1 + expanded_query2)
    expanded_terms = merge_words_by_stem_and_wildcards(expanded_query)
    expanded_terms = group_fuzzy_words(expanded_terms, 80)
    expanded_terms = group_related_words(expanded_terms)
    merged_terms = group_synonyms(expanded_terms)
    return merged_terms

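# Hypothetical end-to-end sketch (commented out; the path and column name are
# assumptions): propose merged tags, then feed them to auto_optimize_query as
# extra_tags, as in the commented example under __main__ below.
#
#     terms = propose_tags(original_query="liana* AND forest*",
#                          input_file="records_sorted.csv",
#                          keyword_column="Keywords",
#                          trainingset=(30, 20), n_tags=15, threshold=0.2)
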
if __name__ == "__main__":
    pca_tags("C:/NLPvenv/NLP/output/csv/savedrecs_lianas_sorted.csv", "Keywords", "Article Title", 4, 35)
    #original_query = "forest* AND tropic* AND (climber* OR liana* OR vine*) AND (trend* OR change*)"
    #merged_terms = propose_tags(original_query=original_query, input_file="C:/NLPvenv/NLP/output/csv/savedrecs_lianas_sorted.csv", keyword_column='Keywords', trainingset=(30, 40), n_tags=20, threshold=0.2)
    #best_query, highest_score, lowest_matches = auto_optimize_query(query="forest* AND tropic* AND (climber* OR liana* OR vine*) AND (trend* OR change*)", extra_tags=merged_terms, tag_file="C:/NLPvenv/NLP/output/csv/savedrecs_lianas_sorted.csv", source_file='D:/Users/Sam/Downloads/lianas_original.xls', tag_column="Keywords", record_amount="", target_file='D:/Users/Sam/Downloads/lianas_oct24.csv', target_title_column='title', source_title_column='Article Title', source_abstract_column='Abstract', max_matches=254)
    #count_title_matches('D:/Users/Sam/Downloads/lianas_oct24.csv', 'D:/Users/Sam/Downloads/savedrecs(6).xls', 'title', 'Article Title')
    # visualize tag clustering
    # correct for dependence among tags