-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathETF_DB_Batch.py
341 lines (267 loc) · 15.8 KB
/
ETF_DB_Batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
#!/usr/bin/env python
# coding: utf-8
# # ETF
# In[11]:
import requests
import sys
import pandas as pd
import numpy as np
import pymssql
import re
from tqdm.notebook import tqdm
import datetime
from datetime import timedelta
import warnings
warnings.filterwarnings(action='ignore')
now = datetime.datetime.now() - timedelta(1)
date = now.strftime('%Y-%m-%d')
if (now.weekday() == 5) | (now.weekday() == 6):
next
else:
print(date)
conn = pymssql.connect(host=r"10.101.34.221", user='sa', password='vmfkdla2006', database='BWPRIME',
charset='UTF-8')
cursor = conn.cursor()
sql = f"SELECT * FROM NEWS_LIST_NAVER WHERE NEWS_DATE BETWEEN '{date} 00:00:00' AND '{date} 23:59:59.997'"
cursor.execute(sql)
rows = cursor.fetchall()
conn.commit()
conn.close()
print('----------------------뉴스 불러오기----------------------')
col = ['IDX', 'CATEGORY', 'NEWS_DATE', 'TITLE', 'clean_content',
'URL_DETAIL', 'final_title', 'final_content',
'positive_score_mean', 'negative_score_mean', 'positive_score_sum',
'negative_score_sum', 'news_sentiment', 'news_label', 'good', 'bad', 'source']
df = pd.DataFrame(np.array(rows), columns=col)
# 인코딩
title = []
content = []
source = []
final_title = []
final_content = []
error = []
for i in range(len(df)):
try:
title.append(df.iloc[i, 3].encode('iso-8859-1').decode('euc-kr'))
except:
title.append(0)
error.append(i)
for i in range(len(df)):
try:
content.append(df.iloc[i, 4].encode('iso-8859-1').decode('euc-kr'))
except:
content.append(0)
error.append(i)
for i in range(len(df)):
try:
source.append(df.iloc[i, 1].encode('iso-8859-1').decode('euc-kr'))
except:
source.append(0)
error.append(i)
for i in range(len(df)):
try:
final_title.append(df.iloc[i, 6].encode('iso-8859-1').decode('euc-kr'))
except:
final_title.append(0)
error.append(i)
for i in range(len(df)):
try:
final_content.append(df.iloc[i, 7].encode('iso-8859-1').decode('euc-kr'))
except:
final_content.append(0)
error.append(i)
len(title), len(content), len(source), len(df)
error = list(set(error))
df['TITLE'] = title
df['clean_content'] = content
df['CATEGORY'] = source
df['final_title'] = final_title
df['final_content'] = final_content
df = df.drop(error).reset_index(drop=True)
news_week = df.sort_values('NEWS_DATE', ascending=False).reset_index(drop=True)
stopword = [
'아직도', '추천주', '무단전재', '재배포', '금지', '및', '돈내고', '받으세요', '클릭', '이젠', '카톡방', '받는다', '아직', '무료', '한국경제',
'아시아경제', 'Global Economy', '이데일리', '매일경제', '연합뉴스', '정보통신신문', '아이투자', '헬스코리아', '뉴스와이어', '델톤', '.com', '아이투자',
'저작권자 ',
'www.itooza.com', '무단전재', '재배포', '금지', '한국경제', '무료', '카톡방', '추천주', '추천', '클릭', '번호', '하루만에', '신청즉시', '관심종목',
'뉴스', '대장주',
'즉시참여하기', '신청폭주상', '빠른신청은', '카운트 다운', '종목명', '계좌', '한국경제신문', '금지', '기자', 'hankyung.com', '아시아경제',
'ioy asiae.co.kr', 'quot',
'받으세요', '광고', '뉴스카페', 'ⓒ', 'ace', '매경이코노미', '매일경제', '무단전재 및 재배포 금지 ', 'mk.co.kr', '\\n☞', 'asiae.co.kr',
'종합 경제정보', '뉴스 앱',
'속보', '주목하세요', '적중', '터질', '종목', '주목', '카카오톡', '드립니다', '기사', '관련', '무조건', '대박', '비율', '연', '월', '일', '훨훨',
'따라오', '체험', '선착순',
'특급', '터질', '오늘', '초보', '기업', '확인', '증권가', '무조건', '연도', '명이', '지금', '당장', '이것', '사라', '주린', '리딩', '평생', '바로',
'입장', '남발', '여의도',
'궁금', '체험', '한가', '폭탄', '한경', '닷컴', '정보', '투자', '방송', '전문가', '종합', '주식', '역대', '분기', '금융', '신청', '실시간', '당신',
'미디어', '합니다',
'시험', '탁론', '모바일', '플러스', '본드', '마켓', '언제', '트래블러', '어디', '생활', '신문', '포인트', '위한', '분야', '기반', '통해', '위해', '한다',
'그룹',
'브랜드', '스타', '된다', '위원회', '해야', '경우', '대한', '때문', '국민', '따르', '연구원', '수수료', '대출', '없이', '증권사', '신용', '저작권자',
'컨텐츠',
'최신', '입니다', '즉시', '문자', '발송', '내일', '상담', '링크', '고민', '타이밍', '여러분', '이제', '모르', '마감', '간편', '한국', '즐거움', '온라인',
'기관', '대비',
'알고리즘', '그래프', '최근', '작성', '해당', '관심', '근거', '참고', '포착', '대해', '인공', '마지막', '최근', '코리아', '스타일', '보도', '직원',
'황제', '탄생', '행진',
'트렌드', '진단', '이날', '전년', '부문', '대표', '비실비실대다', 'i레터', '불리우는', '대국민', '비실비실대', '레터', '옴디아', '무단', '전재', '배포',
'관련주', '련주', '카드뉴스',
'피쉬', '칩스', '신기록(종합2보)', '연합뉴스', '에프앤가이드', 'MARKETPOINT', '명예', '전당', '위험', '등급', '가격', '신탁', '증권', '목적', '신탁',
'수익', '자산', '채권', '위험', '지수', '증권', '집합', '추구', '목적', '전략', '운용', '등급', '변동', '달성', '보장', '상품', '업자', '이상',
'시장', '대상', '재산', '국내', '기준',
'파생', '기구', '펀드', '수준', '비교', '가격', '주가', '손실', '회사', '평가', '기초', '상환', '안정', '가치', '장외', '포트폴리오', '가능', '조기',
'해외', '분석', '비중', '구성', '변경',
'TIGER', 'KODEX', 'KINDEX', 'KBSTAR', 'TIMEFOLIO', 'ARIRANG', 'HANARO', '흥국', 'SMART', 'TREX', '마이티', '파워',
'마이다스', 'FOCUS', 'KOSEF', 'FnGuide', '산출',
'상장', '제공', '최대', '거래소', '거래', '상장지수', '주요', '대형', '매도', '계획', '편입', '제외', '수단', '실행', '연동', '조정', '투자자', '시간',
'and', '보고서', '현지',
'고객', '공개', '최소한', '거래일', '제시', '판단', '시점', '증시', '경제', '이후', '금액', '발표', '국가', '이유', '시가총액', '기간', '미만', '전체',
'규모', '시작', '전날',
'자료', '조사', '관리', '필요', '영업', '일반', '다양', '신규', '이벤트', '가운데', '목표', '가입', '전망', '지급', '차지', '집중', '실적', '예상',
'기대', '고려', 'vs', '일평균', 'star',
'벤트', '실시', '한화', 'nh', '지원', '타임', 'fn', '최초', '원칙'
]
from eunjeon import Mecab
mecab = Mecab('C:/mecab/mecab-ko-dic')
# tag_classes = ['NNG', 'NNP','VA', 'VV+EC', 'XSV+EP', 'XSV+EF', 'XSV+EC', 'VV+ETM', 'MAG', 'MAJ', 'NP', 'NNBC', 'IC', 'XR', 'VA+EC','SL']
# stopword = ['TIGER', 'KODEX', 'KINDEX', 'KBSTAR', 'TIMEFOLIO', 'ARIRANG', 'HANARO', '흥국', 'SMART', 'TREX', '마이티', '파워', '마이다스', 'FOCUS', 'KOSEF']
def extract_noun_mecab_etf(text): # 형태소중 '조사'를 제외하고 clean_word에 추가.
clean_word = []
result = mecab.pos(text)
for word, tag in result:
if tag.startswith('N') or tag.startswith('SL'):
if (word not in stopword) and len(word) > 1:
clean_word.append(word)
return clean_word
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_count(text, max_features):
tfidf = TfidfVectorizer(max_features=max_features, tokenizer=extract_noun_mecab_etf, sublinear_tf=True)
tdm = tfidf.fit_transform(text)
word_count_tfidf = pd.DataFrame({
'단어': tfidf.get_feature_names(),
'빈도': tdm.sum(axis=0).flat
})
return word_count_tfidf
def match_word(w):
matching = w
try:
matching = [s for s in news_week[news_week['final_title'].str.contains(w)]['TITLE'].str.split().values[0] if
w in s][0]
matching = re.sub('[=+,#-/”“▶▲●◆■◀\?.:^$’@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》☞△&·‥""]', ' ', matching).split()
matching = [s for s in matching if w in s][0]
except:
pass
return matching
print('----------------------keyword 추출----------------------')
word = tfidf_count(news_week.final_title, 50).sort_values('빈도', ascending=False).reset_index().iloc[:, 1:]
# word['fund_name'] = np.NaN
# word.단어 = word.단어.replace('버스', '인버스').replace('차이', '차이나').replace('러시', '러시아').replace('우드', '클라우드').replace('바이',
# '바이오').replace(
# '이노', '이노베이션').replace('상하', '상하이').replace('플랫', '플랫폼').replace('어주', '방어주').replace('고배', '고배당').replace('버리',
# '레버리지')
word.단어 = word.단어.apply(lambda x: match_word(x))
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
word['빈도'] = (mm.fit_transform(word[['빈도']])) * 100
word = word.reset_index(drop=True)
# 데일리 키워드 추출
daily_keyword = pd.DataFrame(columns=['DATE', 'TYPE', 'KEYWORD', 'SCORE', 'JM_NM', 'JM_CD', 'FUND_CD'])
daily_keyword[['KEYWORD', 'SCORE']] = word[1:7]
daily_keyword['DATE'] = now.strftime('%Y-%m-%d')
daily_keyword['TYPE'] = 'ETF'
## ETF 관련 기사 키워드 추출
etf_news = news_week[news_week.TITLE.str.contains('ETF')]
etf_news['final_title'] = etf_news['final_title'].apply(
lambda x: x.replace('자산', '').replace('미래', '').replace('운용', '').replace('지수', '').replace('순위', '').replace(
'등록', '')
.replace('순자', '').replace('KB', '').replace('상위', '').replace('거래소', '').replace('정책', '').replace('고배',
'고배당').replace(
'우량주', '우량')
.replace('거래', ''))
def match_word(w):
matching = w
try:
matching = [s for s in etf_news[etf_news['final_title'].str.contains(w)]['TITLE'].str.split().values[0] if
w in s][0]
matching = re.sub('[=+,#-/”“▶▲●◆■◀\?.:^$’@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》☞△&·‥""]', ' ', matching).split()
matching = [s for s in matching if w in s][0]
except:
pass
return matching
word = tfidf_count(etf_news[etf_news.final_content.str.contains('ETF')].final_title, 50).sort_values('빈도',
ascending=False).reset_index().iloc[
:, 1:]
word.단어 = word.단어.apply(lambda x: match_word(x))
# word['fund_name'] = np.NaN
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
word['빈도'] = (mm.fit_transform(word[['빈도']]) + 0.05) * 100
word = word[word['단어'] != 'etf'].reset_index(drop=True)
## 관련 ETF 상품 가져오기
etf_pre = pd.read_csv('C:/Users/남건우/Desktop/batch/data/etf_pre.csv')
for i in tqdm(range(len(word))):
f = etf_pre[etf_pre['clean_remark'].str.contains(word.loc[i, '단어'])][
['JM_CD', 'JM_NM', 'FUND_CD']].drop_duplicates(subset=['JM_CD', 'FUND_CD'])
JM_CD = ' / '.join(str(j) for j in f['JM_CD'][:10])
FUND_CD = ' / '.join(str(j) for j in f['FUND_CD'][:10])
JM_NM = ' / '.join(str(j) for j in f['JM_NM'][:10])
word.loc[i, 'JM_NM'] = JM_NM
word.loc[i, 'JM_CD'] = JM_CD
word.loc[i, 'FUND_CD'] = FUND_CD
# etf_keyword = word[~word.JM_NM.isnull()].reset_index(drop=True)
etf_keyword = word[[len(word.loc[i, 'JM_NM'].split(' / ')) > 1 for i in range(len(word))]].reset_index(drop=True)
etf_keyword.rename(columns={'단어': 'KEYWORD', '빈도': 'SCORE'}, inplace=True)
etf_keyword['DATE'] = now.strftime('%Y-%m-%d')
etf_keyword['TYPE'] = 'ETF'
etf_keyword = etf_keyword[['DATE', 'TYPE', 'KEYWORD', 'SCORE', 'JM_NM', 'JM_CD', 'FUND_CD']][:6]
# etf_keyword = etf_keyword.replace('버스','인버스').replace('차이','차이나').replace('러시','러시아').replace('우드','클라우드').replace('바이','바이오').replace('이노','이노베이션').replace('상하','상하이').replace('플랫','플랫폼').replace('어주','방어주').replace('고배','고배당').replace('버리','레버리지').replace('신재','신재생').replace('리티','모빌리티')
etf_keyword
ff = []
for i in range(len(etf_keyword)):
k = etf_keyword.iloc[i, 2]
for j in range(len(etf_keyword.iloc[i, 4].split(' / '))):
ff.append([k, etf_keyword.iloc[i, 5].split(' / ')[j], etf_keyword.iloc[i, 6].split(' / ')[j],
etf_keyword.iloc[i, 4].split(' / ')[j]])
etf_keyword_dupl = pd.DataFrame(ff, columns=['KEYWORD', 'JM_CD', 'FUND_CD', 'JM_NM'])
etf_keyword_dupl['DATE'] = etf_keyword['DATE'][0]
etf_keyword_dupl['TYPE'] = etf_keyword['TYPE'][0]
etf_keyword_dupl = etf_keyword_dupl[['DATE', 'TYPE', 'KEYWORD', 'JM_CD', 'FUND_CD', 'JM_NM']]
print(etf_keyword_dupl)
daily_keyword = daily_keyword[~daily_keyword['KEYWORD'].isin(etf_keyword['KEYWORD'])]
print(daily_keyword)
# keyword 삽입
print('----------------------keyword 삽입----------------------')
conn = pymssql.connect(host=r"10.101.34.221", user='sa', password='vmfkdla2006', database='BWPRIME', charset='UTF-8')
cursor = conn.cursor()
for i in etf_keyword.values:
try:
insert_query = "INSERT INTO KEYWORD_ETF (DATE, TYPE, KEYWORD, SCORE, JM_NM, JM_CD, FUND_CD) VALUES (%s, %s, %s, %s, %s, %s, %s)"
sql_data = tuple(i)
cursor.execute(insert_query, sql_data)
conn.commit()
except:
pass
conn.close()
conn = pymssql.connect(host=r"10.101.34.221", user='sa', password='vmfkdla2006', database='BWPRIME', charset='UTF-8')
cursor = conn.cursor()
for i in daily_keyword.values:
try:
insert_query = "INSERT INTO KEYWORD_ETF (DATE, TYPE, KEYWORD, SCORE) VALUES (%s, %s, %s, %s)"
sql_data = tuple(i)
cursor.execute(insert_query, sql_data)
conn.commit()
except:
pass
conn.close()
conn = pymssql.connect(host=r"10.101.34.221", user='sa', password='vmfkdla2006', database='BWPRIME', charset='UTF-8')
cursor = conn.cursor()
for i in etf_keyword_dupl.values:
try:
insert_query = "INSERT INTO KEYWORD_ETF_JM (DATE, TYPE, KEYWORD, JM_CD, FUND_CD, JM_NM) VALUES (%s, %s, %s, %s, %s, %s)"
sql_data = tuple(i)
cursor.execute(insert_query, sql_data)
conn.commit()
except:
pass
conn.close()
print('----------------------완료----------------------')
# In[ ]: