-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfIdf.py
42 lines (40 loc) · 1.4 KB
/
tfIdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import csv
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np
# Load the corpus: the third CSV column of every row is treated as one document.
# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly; the with-block closes the file handle once
# all rows have been read.
with open('tabnak150Page.csv', encoding="utf8", newline="") as file:
    csvreader = csv.reader(file)
    # Rows with fewer than three columns (e.g. blank lines, which csv.reader
    # yields as empty lists) carry no document text and would otherwise raise
    # IndexError on row[2].
    docs = [row[2] for row in csvreader if len(row) > 2]

# Learn the TF-IDF vocabulary and vectorize the corpus in a single pass;
# a separate fit() followed by fit_transform() would fit the same data twice.
vectorizer = TfidfVectorizer(lowercase=True)
tfidf_docs = vectorizer.fit_transform(docs)

# Second name for the same document-term matrix. Re-running transform() on
# the corpus that was just fitted would only recompute these values.
docs_tfidf = tfidf_docs

# Dense copy of the first document's TF-IDF row; toarray() is the explicit
# method form of the `.A` shorthand.
z = docs_tfidf[0].toarray()
def quaryPro(query, k=10):
    """Print the ``k`` corpus documents most similar to ``query``.

    The query is vectorized with the module-level ``vectorizer`` and compared
    against every row of the module-level ``tfidf_docs`` by cosine similarity.
    For each hit a separator line carrying the score is printed, followed by
    the matching entry of the module-level ``docs`` list, best match first.

    Args:
        query: Free-text search string.
        k: Maximum number of results to print. Fewer are printed when the
            corpus holds fewer than ``k`` documents; values below zero are
            treated as zero.

    Returns:
        None. Results are written to standard output.
    """
    tfidf_query = vectorizer.transform([query])

    # One batched call scores every document against the single query row;
    # flattening the (n_docs, 1) result gives one score per document. This
    # also avoids calling float() on a 2-D array once per document.
    cosines = cosine_similarity(tfidf_docs, tfidf_query).ravel()

    # argsort is ascending, so reverse it to walk from the best match down.
    # Slicing (instead of indexing k fixed positions) keeps small corpora
    # from raising IndexError.
    sorted_ids = np.argsort(cosines)[::-1]
    for cur_id in sorted_ids[:max(k, 0)]:
        print('----------------------------------------------',float(cosines[cur_id]),'--------------------------------------------------------')
        print(docs[cur_id])
# Interactive search loop: show the prompt banner, read one query per line
# from standard input, print its best matches, then a closing separator.
# The banner text is Persian (roughly "enter").
while True:
    print('###################################################وارد کنید','################################################')
    try:
        query = input()
    except (EOFError, KeyboardInterrupt):
        # Closed/exhausted stdin or Ctrl-C at the prompt: stop cleanly
        # instead of ending the session with a traceback.
        break
    quaryPro(query)
    print('----------------------------------------------------------------------------------------------------------------')