반응형
DB 설계
-- News: one row per collected article.
-- `link` is declared UNIQUE, so inserting the same article twice is rejected.
CREATE TABLE [dbo].News
(
    [nid] INT NOT NULL PRIMARY KEY IDENTITY,   -- surrogate key
    [title] VARCHAR(200) NOT NULL,
    [link] VARCHAR(200) NOT NULL,
    [description] VARCHAR(MAX) NOT NULL,
    [pubdate] DATETIME NOT NULL,
    [mcnt] INT NOT NULL,                       -- morpheme count, written by NewsSql.UpdateMCnt
    -- Table-level constraint: it must be separated from the last column by a comma.
    CONSTRAINT TU UNIQUE(link)
)
===================================================
-- Morpheme: dictionary of distinct words; `word` is declared UNIQUE.
CREATE TABLE [dbo].Morpheme
(
    [mid] INT NOT NULL PRIMARY KEY IDENTITY,   -- surrogate key
    [word] VARCHAR(50) NOT NULL,
    -- The application inserts rows with only `word` supplied, so a NOT NULL
    -- column without a default would reject every insert. Default it to 0.
    [rcnt] INT NOT NULL DEFAULT 0,
    CONSTRAINT WU UNIQUE(word)
)
===================================================
-- Inverse: inverted-index rows linking a morpheme (mid) to a news item (nid),
-- with the reference count recorded for that pair.
CREATE TABLE [dbo].[Inverse]
(
    [nid]  INT NOT NULL,
    [mid]  INT NOT NULL,
    [rcnt] INT NOT NULL
)
관계 추가
News(nid) - Inverse(nid)
Morpheme(mid) - Inverse(mid)
관계 설정
EHHelper.py
#EHHelper.py
class EHHelper:
    """Static text clean-up helpers.

    Pipeline applied by EmitTagAndSpecialCh, step by step:

        source               '<b>음냐뤼, &quot; 가나다&quot;</b>'
        RemoveTag            '음냐뤼, &quot; 가나다&quot;'
        RemoveHtmlSpecialCh  '음냐뤼,  가나다'
        RemoveSymbol         '음냐뤼  가나다'
    """

    @staticmethod
    def EmitTagAndSpecialCh(str):
        """Return the text with tags, HTML entities and symbols removed.

        The parameter name shadows the builtin ``str``; it is kept unchanged
        so existing callers are unaffected.
        """
        text = EHHelper.RemoveTag(str)
        text = EHHelper.RemoveHtmlSpecialCh(text)
        return EHHelper.RemoveSymbol(text)

    @staticmethod
    def _StripDelimited(src, opener, closer):
        """Repeatedly delete the span from the first opener to the first closer.

        When the first closer precedes the first opener, only that stray
        closer is deleted. Stops as soon as either character is absent.
        """
        while True:
            s = src.find(opener)
            e = src.find(closer)
            if s < 0 or e < 0:
                return src
            if s < e:
                src = src[:s] + src[e + 1:]  # drop opener..closer inclusive
            else:
                src = src[:e] + src[e + 1:]  # drop only the stray closer

    @staticmethod
    def RemoveTag(src):
        """Remove '<...>' spans (and any '>' that appears before the first '<')."""
        return EHHelper._StripDelimited(src, '<', '>')

    @staticmethod
    def FindTag(src):
        """Return (index of first '<', index of first '>').

        Raises ValueError when either character is missing.
        """
        s = src.index('<')
        e = src.index('>')
        return s, e

    @staticmethod
    def RemoveSymbol(src):
        """Keep only alphabetic and whitespace characters."""
        return "".join(ch for ch in src if ch.isalpha() or ch.isspace())

    @staticmethod
    def RemoveHtmlSpecialCh(src):
        """Remove '&...;' spans (and any ';' that appears before the first '&')."""
        return EHHelper._StripDelimited(src, '&', ';')

    @staticmethod
    def FindHtmlSpecialCh(src):
        """Return (index of first '&', index of first ';').

        Raises ValueError when either character is missing.
        """
        s = src.index('&')
        e = src.index(';')
        return s, e

    @staticmethod
    def MssqlstrToStrKor(src):
        """Convert a Korean string that was mis-decoded as ISO-8859-1.

        The bytes are recovered with ISO-8859-1 and decoded as CP949, a
        superset of EUC-KR, so syllables outside EUC-KR no longer raise.
        Text that cannot be round-tripped (for example a string that is
        already correct Unicode) is returned unchanged instead of raising.
        """
        try:
            return src.encode('ISO-8859-1').decode('cp949')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return src
InverseSql.py
#InverseSql.py
import pymssql
class InverseSql:
    """Data access for the Inverse table (columns nid, mid, rcnt)."""

    @staticmethod
    def _Connect():
        """Open a new connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded;
        # consider moving them to configuration.
        return pymssql.connect("127.0.0.1:1433", "sa", "1234", "BigPro")

    @staticmethod
    def AddInverseItem(nid, mid, rcnt):
        """Insert one (nid, mid, rcnt) row.

        Best effort: a database error raised by the insert is ignored, as
        before. The connection is always closed.
        """
        conn = InverseSql._Connect()
        try:
            cursor = conn.cursor()
            # Values are passed as parameters instead of being formatted
            # into the SQL text.
            cursor.execute(
                "Insert into Inverse(nid,mid,rcnt) values(%s,%s,%s)",
                (nid, mid, rcnt))
            conn.commit()
        except pymssql.Error:
            pass  # deliberate best-effort insert
        finally:
            conn.close()

    @staticmethod
    def FindInv(mid):
        """Return a list of (nid, rcnt) rows stored for the given mid."""
        conn = InverseSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("Select nid,rcnt from Inverse where mid=%s", (mid,))
            return list(cursor.fetchall())
        finally:
            conn.close()  # previously leaked when execute() raised
Morpheme.py
#Morpheme.py - 형태소 클래스
class Morpheme:
    """A word together with the number of times it has been referenced."""

    def __init__(self, word):
        self.word = word  # the word itself
        self.ref = 1      # reference count; starts at one occurrence

    def Merge(self, other):
        """Add other's reference count to this one when both hold the same word."""
        if not self.IsEqual(other):
            return
        self.ref += other.ref

    def IsEqual(self, other):
        """Return True when both morphemes hold the same word."""
        return self.word == other.word
MorphemeParser.py
#MorphemeParser.py - 형태소 분석기
from Morpheme import Morpheme
from EHHelper import EHHelper
class MorphemeParser:
    """Splits text into words and merges duplicates into Morpheme objects."""

    @staticmethod
    def Parse(src):
        """Return the distinct morphemes of src, each carrying its occurrence count.

        The text is cleaned with EHHelper.EmitTagAndSpecialCh and then split
        on ANY whitespace. Splitting on the single space character left words
        joined by a tab or newline as one token, which then failed the
        isalpha() check and was dropped.
        """
        src = EHHelper.EmitTagAndSpecialCh(src)
        morphes = [Morpheme(token) for token in src.split() if token.isalpha()]
        return MorphemeParser.Merge(morphes)

    @staticmethod
    def Merge(morphes):
        """Merge morphemes that share a word, keeping first-seen order.

        The first morpheme seen for a word is kept and absorbs later ones
        through its own Merge method. A dict lookup replaces the previous
        linear scan of the result list.
        """
        first_by_word = {}
        for morph in morphes:
            kept = first_by_word.get(morph.word)
            if kept is None:
                first_by_word[morph.word] = morph
            else:
                kept.Merge(morph)
        return list(first_by_word.values())
MorphemeSql.py
#MorphemeSql.py
import pymssql
class MorphemeSql:
    """Data access for the Morpheme table."""

    @staticmethod
    def _Connect():
        """Open a new connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded;
        # consider moving them to configuration.
        return pymssql.connect("127.0.0.1:1433", "sa", "1234", "BigPro")

    @staticmethod
    def AddMorpheme(mo):
        """Insert mo.word into Morpheme.

        Best effort: a database error raised by the insert is ignored, as
        before. Only `word` is supplied, so the table must accept a row
        without an explicit rcnt.
        """
        conn = MorphemeSql._Connect()
        try:
            cursor = conn.cursor()
            # The word is passed as a parameter instead of being spliced
            # into the SQL text.
            cursor.execute("Insert into Morpheme(word) values(%s)", (mo.word,))
            conn.commit()
        except pymssql.Error:
            pass  # deliberate best-effort insert
        finally:
            conn.close()

    @staticmethod
    def FindMid(word):
        """Return the mid stored for word, or 0 when no row matches."""
        conn = MorphemeSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select mid from Morpheme where (word=%s)", (word,))
            row = cursor.fetchone()
        finally:
            conn.close()
        if row:
            return row[0]
        return 0

    @staticmethod
    def ListMorpheme():
        """Return every word stored in Morpheme as a list."""
        conn = MorphemeSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select word from Morpheme")
            return [row[0] for row in cursor.fetchall()]
        finally:
            conn.close()
NAWidget.py
#NAWidget.py - 뉴스 분석 Widget
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from NewsSql import NewsSql
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
from EHHelper import EHHelper
class NAWidget(QDialog):
    """Dialog that lists stored morphemes and the news items referencing the selected one."""

    def __init__(self, pa):
        super().__init__(pa)
        self.resize(2000,1000)
        lb_mo = QLabel("형태소 목록",self)
        lb_mo.move(20,20)
        self.lbox_mo = QListWidget(self)
        self.lbox_mo.move(20,100)
        self.lbox_mo.resize(400,880)
        lb_news = QLabel("포함 뉴스 목록",self)
        lb_news.move(440,20)
        self.tb_news = QTableWidget(self)
        self.tb_news.move(440,100)
        self.tb_news.resize(1540,880)
        self.tb_news.setColumnCount(4)
        # setColumnWidth takes an int; true division produced floats, which
        # newer Python versions no longer convert implicitly.
        table_width = self.tb_news.width()
        self.tb_news.setColumnWidth(0, table_width // 5)
        self.tb_news.setColumnWidth(1, table_width * 2 // 5)
        self.tb_news.setColumnWidth(2, table_width // 6)
        self.tb_news.setColumnWidth(3, table_width // 6)
        self.tb_news.setHorizontalHeaderLabels(
            ["제목","링크","전체 개수","참조 개수"])
        self.lbox_mo.currentRowChanged.connect(self.OnSelectNews)
        self.ListMorpheme()

    def ListMorpheme(self):
        """Fill the list box with every word returned by MorphemeSql.ListMorpheme."""
        for mo in MorphemeSql.ListMorpheme():
            self.lbox_mo.addItem(EHHelper.MssqlstrToStrKor(mo))

    def OnSelectNews(self):
        """Show the news rows that reference the currently selected morpheme."""
        self.tb_news.setRowCount(0)  # drop every existing row
        item = self.lbox_mo.currentItem()
        if item is None:  # nothing is selected
            return
        mid = MorphemeSql.FindMid(item.text())
        ins = InverseSql.FindInv(mid)
        self.tb_news.setRowCount(len(ins))
        for ri, (nid, rcnt) in enumerate(ins):
            news = NewsSql.FindNewsByNid(nid)
            if not news:  # no News row for this nid; leave the table row empty
                continue
            title, link, description, pubdate, mcnt = news
            title = EHHelper.MssqlstrToStrKor(title)
            self.tb_news.setCellWidget(ri,0,QLabel(title))
            self.tb_news.setCellWidget(ri,1,QLabel(link))
            self.tb_news.setCellWidget(ri,2,QLabel(str(mcnt)))
            self.tb_news.setCellWidget(ri,3,QLabel(str(rcnt)))
            self.tb_news.setRowHeight(ri,70)
NCWindow.py
#NCWindow.py
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from NewsAnaylizer import NewsAnaylizer
from NewsSql import NewsSql
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
import _thread
from NAWidget import NAWidget
from NSDialog import NSDialog
def Analyze(datas):
    """Store each analysed news item, its morphemes and the inverse-index rows.

    Each element of datas unpacks into (news, title morphemes, description
    morphemes).
    """
    for news, title_moes, desc_moes in datas:
        news.PreProcess()
        NewsSql.AddNews(news)
        total_moes = len(title_moes) + len(desc_moes)
        NewsSql.UpdateMCnt(news.link, total_moes)
        nid = NewsSql.FindNid(news.link)
        # Title morphemes first, then description morphemes.
        for group in (title_moes, desc_moes):
            for mo in group:
                MorphemeSql.AddMorpheme(mo)
                mid = MorphemeSql.FindMid(mo.word)
                InverseSql.AddInverseItem(nid, mid, mo.ref)
class NCWindow(QMainWindow):
    """Main collector window: searches news, shows the hits and stores the analysis."""

    def __init__(self):
        super().__init__()
        self.setWindowTitle("수집기(feat.네이버 개발자 센터)")
        self.na = NewsAnaylizer()
        self.resize(2000,1000)
        self.te_query = QTextEdit(self)
        self.te_query.move(20,20)
        self.te_query.resize(800,60)
        self.btn_search = QPushButton("검색",self)
        self.btn_search.move(840,20)
        self.btn_search.resize(200,60)
        self.tb_news = QTableWidget(self)
        self.tb_news.move(20,100)
        self.tb_news.resize(1960,880)
        self.tb_news.setColumnCount(3)
        # setColumnWidth takes an int; true division produced floats, which
        # newer Python versions no longer convert implicitly.
        table_width = self.tb_news.width()
        self.tb_news.setColumnWidth(0, table_width * 2 // 10)
        self.tb_news.setColumnWidth(1, table_width * 6 // 10)
        self.tb_news.setColumnWidth(2, table_width * 2 // 10 - 10)
        self.tb_news.setHorizontalHeaderLabels(["제목","주소","단어 개수"])
        self.btn_search.clicked.connect(self.OnSearch)
        self.btn_analyze = QPushButton("분석", self)
        self.btn_analyze.move(1760,20)
        self.btn_analyze.resize(200,60)
        self.btn_analyze.clicked.connect(self.OnAnalyze)
        self.btn_view_sdlg = QPushButton("검색기 띄우기",self)
        self.btn_view_sdlg.move(1400,20)
        self.btn_view_sdlg.resize(300,60)
        self.btn_view_sdlg.clicked.connect(self.OnShowSearchDlg)

    def OnShowSearchDlg(self):
        """Open the news search dialog."""
        nsd = NSDialog(self)
        nsd.show()

    def OnAnalyze(self):
        """Open the morpheme analysis dialog."""
        naw = NAWidget(self)
        naw.show()

    def OnSearch(self):
        """Run the query, list the hits and store them on a background thread."""
        self.tb_news.setRowCount(0)
        query = self.te_query.toPlainText()
        datas = self.na.Analize(query)
        self.tb_news.setRowCount(len(datas))
        for ri, (news, mo1, mo2) in enumerate(datas):
            mcnt = len(mo1) + len(mo2)
            self.tb_news.setCellWidget(ri,0,QLabel(news.title))
            self.tb_news.setCellWidget(ri,1,QLabel(news.link))
            self.tb_news.setCellWidget(ri,2,QLabel(str(mcnt)))
            self.tb_news.setRowHeight(ri,60)
        # Storing the results is done off the GUI thread.
        _thread.start_new_thread(Analyze,(datas,))
News.py
#News.py
from MorphemeParser import MorphemeParser
from EHHelper import EHHelper
import datetime
class News:
    """A news item: title, link, description and publication date."""

    def __init__(self, title, link, description, pubdate):
        self.title = title
        self.link = link
        self.description = description
        self.pubdate = pubdate

    def PreProcess(self):
        """Strip tags, HTML entities and symbols from the title and description."""
        self.title = EHHelper.EmitTagAndSpecialCh(self.title)
        self.description = EHHelper.EmitTagAndSpecialCh(self.description)

    @staticmethod
    def MakeNews(jnews):
        """Build a News from a dict with 'title', 'link', 'description', 'pubDate'.

        'pubDate' is parsed as e.g. 'Mon, 09 Nov 2020 10:20:30 +0900' and the
        UTC offset is dropped, giving a naive datetime with the same
        wall-clock time. When the value cannot be parsed, the current time is
        used instead.

        The previous code passed an undefined name to strptime; the resulting
        NameError was swallowed, so the publication date was always "now".
        """
        title = jnews['title']
        link = jnews['link']
        description = jnews['description']
        raw_pubdate = jnews['pubDate']
        try:
            parsed = datetime.datetime.strptime(
                raw_pubdate, "%a, %d %b %Y %H:%M:%S %z")
        except (TypeError, ValueError):
            pubdate = datetime.datetime.now()
        else:
            pubdate = parsed.replace(tzinfo=None)
        return News(title, link, description, pubdate)
NewsAnaylizer.py
#NewsAnaylizer.py - 뉴스 분석기
from NewsSearcher import NewsSearcher
from MorphemeParser import MorphemeParser
from Morpheme import Morpheme
from News import News
class NewsAnaylizer:
    """Searches news for a query and parses each hit into morphemes."""

    def __init__(self):
        self.ns = NewsSearcher()

    def Analize(self, query):
        """Return a list of [news, title morphemes, description morphemes] per hit."""
        self.ns.SetQuery(query)
        results = []
        for jnews in self.ns.RequestAll():
            news = News.MakeNews(jnews)
            title_moes = MorphemeParser.Parse(news.title)
            desc_moes = MorphemeParser.Parse(news.description)
            results.append([news, title_moes, desc_moes])
        return results
NewsSearcher.py
#NewsSearcher.py - 뉴스 검색기
from MorphemeParser import MorphemeParser
from Morpheme import Morpheme
import urllib.request
import json
class NewsSearcher:
    """Client for the news search endpoint stored in self.url."""

    # RequestAll stops paging once `start` would exceed this value.
    MAX_START = 1000

    def __init__(self, client_id=None, client_secret=None):
        """Store the credentials and endpoint URL.

        client_id / client_secret are optional; when omitted, the values
        previously embedded in this file are used.
        """
        # NOTE(review): credentials are embedded in source. Pass them in (or
        # load them from configuration) and rotate the embedded pair.
        self.client_id = client_id if client_id is not None else "N3yLQ95_i8KdsL3tJAJ3"
        self.client_secret = client_secret if client_secret is not None else "99b7XrrXp3"
        self.url = "https://openapi.naver.com/v1/search/news.json"

    def SetQuery(self, query):
        """URL-quote the query and remember it as the 'query=' parameter."""
        query = urllib.parse.quote(query)
        self.qp = "query=" + query

    def Request(self, start, display):
        """Fetch one page of results.

        Returns (items, total). On a request failure or a non-200 status the
        result is ([], 0).
        """
        sp = "start=" + str(start)
        dp = "display=" + str(display)
        query_str = self.url + "?" + self.qp + "&" + sp + "&" + dp
        request = urllib.request.Request(query_str)
        request.add_header("X-Naver-Client-Id", self.client_id)
        request.add_header("X-Naver-Client-Secret", self.client_secret)
        try:
            # The context manager closes the response; it was never closed before.
            with urllib.request.urlopen(request) as response:
                if response.getcode() != 200:
                    return list(), 0
                content = response.read()
        except Exception:
            # Best effort, as before: any request failure yields an empty
            # page. KeyboardInterrupt/SystemExit are no longer swallowed.
            return list(), 0
        jdata = json.loads(content.decode('utf-8'))
        total = int(jdata['total'])
        return jdata['items'], total

    def RequestAll(self):
        """Fetch every page (100 items each) and return all items in one list.

        Paging continues while `start` is within both the reported total and
        MAX_START. The bound on total is inclusive: with `start < total` the
        last item was skipped whenever it was the first item of a new page
        (for example total == 101).
        """
        start = 1
        display = 100
        redatas = list()
        datas, total = self.Request(start, display)
        redatas.extend(datas)
        start = start + display
        while start <= total and start <= NewsSearcher.MAX_START:
            datas, total = self.Request(start, display)
            redatas.extend(datas)
            start = start + display
        return redatas
NewsSql.py
#NewsSql.py
from News import News
import pymssql
class NewsSql:
    """Data access for the News table."""

    @staticmethod
    def _Connect():
        """Open a new connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded;
        # consider moving them to configuration.
        return pymssql.connect("127.0.0.1:1433", "sa", "1234", "BigPro")

    @staticmethod
    def AddNews(news):
        """Insert a news item with mcnt set to 0.

        Best effort: a database error raised by the insert is ignored, as
        before. Title, description and link are passed as parameters, so
        quotes in the text can no longer break or alter the statement.
        """
        dstr = news.pubdate.strftime('%Y-%m-%d %H:%M:%S')
        query = ("insert into News (title,description,link,pubdate,mcnt) "
                 "values(%s,%s,%s,%s,%s)")
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                query, (news.title, news.description, news.link, dstr, 0))
            conn.commit()
        except pymssql.Error:
            pass  # deliberate best-effort insert
        finally:
            conn.close()

    @staticmethod
    def UpdateMCnt(link, mcnt):
        """Set mcnt on the row whose link matches."""
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "update News set mcnt=%s where (link=%s)", (mcnt, link))
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def FindNid(link):
        """Return the nid of the row whose link matches.

        Raises TypeError when no row matches (unchanged behaviour).
        """
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select nid from News where (link=%s)", (link,))
            row = cursor.fetchone()
        finally:
            conn.close()
        return row[0]

    @staticmethod
    def FindNewsByNid(nid):
        """Return the (title, link, description, pubdate, mcnt) row for nid.

        The result is whatever fetchone() yields for the query.
        """
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "select title,link,description,pubdate,mcnt from News "
                " where (nid=%s)", (nid,))
            return cursor.fetchone()
        finally:
            conn.close()

    @staticmethod
    def TotalDocumentCount():
        """Return the number of rows in News."""
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select count(*) from News")
            row = cursor.fetchone()
        finally:
            conn.close()
        return row[0]
NSDialog.py
#NSDialog.py - 뉴스 검색 대화상자
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from NewsSql import NewsSql
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
from EHHelper import EHHelper
from MorphemeParser import MorphemeParser
from ScoredNews import ScoredNews
from News import News
import math
class NSDialog(QDialog):
    """News search dialog: scores stored news against the query words."""

    def __init__(self, pa):
        super().__init__(pa)
        self.resize(2000,1000)
        self.setWindowTitle("뉴스 검색기")
        self.te_query = QTextEdit(self)
        self.te_query.move(20,20)
        self.te_query.resize(800,60)
        self.btn_search = QPushButton("검색",self)
        self.btn_search.move(840,20)
        self.btn_search.resize(200,60)
        self.lbox_news = QListWidget(self)
        self.lbox_news.move(20,90)
        self.lbox_news.resize(600,900)
        lb_d1 = QLabel("제목:",self)
        lb_d1.move(640,100)
        lb_d2 = QLabel("링크",self)
        lb_d2.move(640,180)
        lb_d3 = QLabel("점수:",self)
        lb_d3.move(640,260)
        self.te_title = QTextEdit("",self)
        self.te_title.setReadOnly(True)
        self.te_title.move(740,100)
        self.te_title.resize(1240,60)
        self.te_link = QTextEdit("",self)
        self.te_link.setReadOnly(True)
        self.te_link.move(740,180)
        self.te_link.resize(1240,60)
        self.te_score = QTextEdit("",self)
        self.te_score.setReadOnly(True)
        self.te_score.move(740,260)
        self.te_score.resize(1240,60)
        self.te_description = QTextEdit("",self)
        self.te_description.setReadOnly(True)
        self.te_description.move(640,340)
        self.te_description.resize(1340,650)
        self.btn_search.clicked.connect(self.OnSearch)
        self.lbox_news.currentRowChanged.connect(self.OnSelectChange)
        self.sns = list()  # scored results, index-aligned with lbox_news rows

    def OnSearch(self):
        """Score stored news against the query and list them.

        For every query morpheme and every news item referencing it:
            tf    = rcnt / mcnt
            idf   = total document count / (number of referencing rows + 1)
            score = tf * log(idf)
        Scores of the same news (same link) are summed by MergeDupSns.
        """
        # Clear sns first: clearing the list widget can emit
        # currentRowChanged, and OnSelectChange returns early on empty sns.
        self.sns.clear()
        # Rows from the previous search were never removed, so the list
        # widget and self.sns drifted out of step.
        self.lbox_news.clear()
        query = self.te_query.toPlainText()
        tdcnt = NewsSql.TotalDocumentCount()
        for mo in MorphemeParser.Parse(query):
            mid = MorphemeSql.FindMid(mo.word)
            ins = InverseSql.FindInv(mid)
            idf = tdcnt/(len(ins)+1)
            for nid, rcnt in ins:
                news = NewsSql.FindNewsByNid(nid)
                if not news:  # no News row for this nid
                    continue
                title, link, description, pubdate, mcnt = news
                if not mcnt:  # avoid dividing by zero
                    continue
                title = EHHelper.MssqlstrToStrKor(title)
                description = EHHelper.MssqlstrToStrKor(description)
                tf = rcnt/mcnt
                score = tf*math.log(idf)
                sn = ScoredNews(News(title,link,description,pubdate),score)
                self.sns.append(sn)
        self.sns = NSDialog.MergeDupSns(self.sns)
        for sn in self.sns:
            self.lbox_news.addItem(sn.news.title)

    def OnSelectChange(self):
        """Show the details of the currently selected result."""
        index = self.lbox_news.currentRow()
        # currentRow() is -1 when nothing is selected; a negative index
        # would otherwise silently pick the last result.
        if index < 0 or index >= len(self.sns):
            return
        sn = self.sns[index]
        self.te_title.setText(sn.news.title)
        self.te_link.setText(sn.news.link)
        self.te_description.setText(sn.news.description)
        self.te_score.setText(str(sn.score))

    @staticmethod
    def MergeDupSns(sns):
        """Sum the scores of entries sharing a link and return them sorted.

        The first entry seen for a link is kept and its score is updated in
        place. Ordering comes from sorted(), i.e. from the entries' own
        comparison. A dict lookup replaces the previous linear scan.
        """
        first_by_link = {}
        for sn in sns:
            kept = first_by_link.get(sn.news.link)
            if kept is None:
                first_by_link[sn.news.link] = sn
            else:
                kept.score += sn.score
        return sorted(first_by_link.values())
ScoredNews.py
#ScoredNews.py
from News import News
class ScoredNews:
    """Pairs a news item with its score."""

    def __init__(self, news, score):
        self.news = news
        self.score = score

    def __lt__(self, other):
        # Deliberately inverted: the higher score compares as "less", so
        # sorted() yields the highest score first.
        return other.score < self.score
Main.py
import sys
from PyQt5.QtWidgets import QApplication
from NCWindow import NCWindow
# Entry point: create the Qt application, show the main collector window and
# run the event loop; the event loop's return value becomes the exit status.
app = QApplication(sys.argv)
ncw = NCWindow()
ncw.show()
sys.exit(app.exec_())
반응형
'언어 자료구조 알고리즘 > 파이썬(Python)' 카테고리의 다른 글
[python] 뉴스 검색기V03 feat. 네이버 개발자센터, 형태소 분석, MSSQL (0) | 2020.11.11 |
---|---|
[ python] 뉴스 분석기 feat.네이버 개발자센터 , 형태소 분석 (0) | 2020.11.10 |
[python] 뉴스 검색 - 형태소 분석 (feat. 네이버 개발자센터) (0) | 2020.11.10 |
[python] 도서 검색기 feat. 네이버 개발자센터 (0) | 2020.11.09 |
[python] 네이버 도서 검색 API 활용 - Json (0) | 2020.11.09 |
파이썬에서 별도의 선택문은 없어요. 대신 elif를 이용하세요. (0) | 2020.10.22 |
[python] 13. 리스트의 요소 개수 알아내기 및 정렬하기 (3) | 2016.05.31 |
[python] 12. 리스트에서 자료 삭제하기 (0) | 2016.05.31 |
[python] 11. 리스트에 자료를 추가하기 (0) | 2016.05.23 |
[python] 10. 파이썬을 잘 사용하기 위한 첫 걸음, 리스트를 소개합니다. (0) | 2016.05.20 |