언어 자료구조 알고리즘/파이썬(Python)

[python] 뉴스 검색기V04 feat.네이버 개발자센터

언제나휴일 2020. 11. 11. 16:37
반응형

DB 설계

-- News: one row per collected article; link is unique so an article is stored once.
CREATE TABLE [dbo].News
(
    [nid] INT NOT NULL PRIMARY KEY IDENTITY,
    [title] VARCHAR(200) NOT NULL,
    [link] VARCHAR(200) NOT NULL,
    [description] VARCHAR(MAX) NOT NULL,
    [pubdate] DATETIME NOT NULL,
    -- Morpheme count of the article; written by NewsSql.UpdateMCnt.
    [mcnt] int NOT NULL,
    -- Table-level constraint: separated from the last column by a comma,
    -- matching the Morpheme table definition.
    CONSTRAINT TU UNIQUE(link)
)
===================================================
-- Morpheme: one row per distinct word (enforced by the WU unique constraint).
CREATE TABLE [dbo].Morpheme
(
    [mid] INT NOT NULL PRIMARY KEY IDENTITY, 
    [word] VARCHAR(50) NOT NULL, 
    -- NOT NULL with no DEFAULT: every INSERT has to supply a value for rcnt.
    [rcnt] INT NOT NULL,
    CONSTRAINT WU UNIQUE(word)
)
===================================================
-- Inverse: links a news row (nid) to a morpheme row (mid) with a reference count.
-- No keys are declared here; the News(nid) and Morpheme(mid) relationships are
-- added separately.
CREATE TABLE [dbo].Inverse
(
    [nid] INT NOT NULL, 
    [mid] INT NOT NULL, 
    [rcnt] INT NOT NULL 
)

관계 추가
News(nid) - Inverse(nid)
Morpheme(mid) - Inverse(mid)

관계 설정

News 테이블의 nid와 Inverse 테이블의 nid
Morpheme 테이블의 mid 와 Inverse 테이블의 mid

 

 

 

EHHelper.py

EHHelper.py
0.00MB

#EHHelper.py
class EHHelper:
    """Text clean-up helpers shared by the collector and the search UI.

    Example of the EmitTagAndSpecialCh pipeline:
        '<b>Hello, &quot;World&quot;</b>'
        RemoveTag           -> 'Hello, &quot;World&quot;'
        RemoveHtmlSpecialCh -> 'Hello, World'
        RemoveSymbol        -> 'Hello World'
    """

    @staticmethod
    def EmitTagAndSpecialCh(str):
        """Return the text with tags, HTML entities and symbols removed.

        The parameter name shadows the builtin ``str``; it is kept so that
        existing keyword callers continue to work.
        """
        text = EHHelper.RemoveTag(str)
        text = EHHelper.RemoveHtmlSpecialCh(text)
        return EHHelper.RemoveSymbol(text)

    @staticmethod
    def RemoveTag(src):
        """Remove every '<...>' span from src.

        A '>' that appears before the first '<' is removed on its own.
        Processing stops as soon as either '<' or '>' is no longer present,
        so an unmatched leftover bracket stays in the result.
        """
        while True:
            s = src.find('<')
            e = src.find('>')
            if s < 0 or e < 0:  # no complete tag left
                return src
            if s < e:
                src = src[:s] + src[e + 1:]  # drop the whole tag
            else:
                src = src[:e] + src[e + 1:]  # drop only the stray '>'

    @staticmethod
    def FindTag(src):
        """Return the indexes of the first '<' and the first '>' in src.

        Raises ValueError when either character is missing.
        """
        s = src.index('<')
        e = src.index('>')
        return s, e

    @staticmethod
    def RemoveSymbol(src):
        """Keep only alphabetic and whitespace characters of src."""
        return "".join(ch for ch in src if ch.isalpha() or ch.isspace())

    @staticmethod
    def RemoveHtmlSpecialCh(src):
        """Remove HTML character references such as '&quot;' or '&#39;'.

        Only well-formed references ('&' + optional '#' + letters/digits +
        ';') are removed. Text that merely contains an unrelated '&' and a
        later ';' (for example 'R&D plan; next') is left untouched instead
        of losing everything in between.
        """
        import re  # local import: this module has no file-level imports

        return re.sub(r'&#?[0-9A-Za-z]+;', '', src)

    @staticmethod
    def FindHtmlSpecialCh(src):
        """Return the indexes of the first '&' and the first ';' in src.

        Raises ValueError when either character is missing.
        """
        s = src.index('&')
        e = src.index(';')
        return s, e

    @staticmethod
    def MssqlstrToStrKor(src):
        """Convert a Korean string read from MSSQL into a proper Python string.

        The text is encoded back to bytes as ISO-8859-1 and those bytes are
        decoded as EUC-KR.
        """
        raw = src.encode('ISO-8859-1')
        return raw.decode('euc-kr')

 

InverseSql.py

InverseSql.py
0.00MB

#InverseSql.py
import pymssql
class InverseSql:
    """Data access for the Inverse table (news id <-> morpheme id, with count)."""

    @staticmethod
    def _connect():
        """Open a new connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded.
        return pymssql.connect("127.0.0.1:1433", "sa", "1234", "BigPro")

    @staticmethod
    def AddInverseItem(nid, mid, rcnt):
        """Insert one (nid, mid, rcnt) row; database errors are ignored.

        The insert is best-effort, as before: a failing statement does not
        raise. Connection errors still propagate.
        """
        conn = InverseSql._connect()
        try:
            cursor = conn.cursor()
            # Parameterized statement: values are quoted by the driver.
            cursor.execute(
                "Insert into Inverse(nid,mid,rcnt) values(%s,%s,%s)",
                (nid, mid, rcnt))
            conn.commit()
        except pymssql.Error:
            pass  # best-effort insert
        finally:
            conn.close()  # always release the connection

    @staticmethod
    def FindInv(mid):
        """Return a list of (nid, rcnt) rows for the given morpheme id."""
        conn = InverseSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "Select nid,rcnt from Inverse where mid=%s", (mid,))
            return list(cursor.fetchall())
        finally:
            conn.close()

Morpheme.py

#Morpheme.py  - 형태소 클래스
class Morpheme:
    """A word together with the number of times it has been referenced."""

    def __init__(self, word):
        # A freshly created morpheme counts as one reference.
        self.word, self.ref = word, 1

    def IsEqual(self, other):
        """Tell whether other carries the same word as this morpheme."""
        same_word = self.word == other.word
        return same_word

    def Merge(self, other):
        """Add other's reference count to this one when the words match."""
        if not self.IsEqual(other):
            return
        self.ref += other.ref

MorphemeParser.py

MorphemeParser.py
0.00MB

#MorphemeParser.py - 형태소 분석기
from Morpheme import Morpheme
from EHHelper import EHHelper
class MorphemeParser:
    """Splits text into word-level morphemes and merges duplicates."""

    @staticmethod
    def Parse(src):
        """Return the merged list of Morpheme objects found in src.

        Tags, HTML entities and symbols are stripped first. The cleaned text
        is then split on any run of whitespace, so words separated by tabs
        or newlines are kept as separate words rather than being dropped.
        """
        src = EHHelper.EmitTagAndSpecialCh(src)
        # split() without arguments also discards empty tokens.
        morphes = [Morpheme(token) for token in src.split()
                   if token.isalpha()]
        return MorphemeParser.Merge(morphes)

    @staticmethod
    def Merge(morphes):
        """Merge morphemes that share a word, keeping first-seen order.

        The first occurrence of each word is kept and every later duplicate
        is folded into it through its Merge method.
        """
        merged = {}  # word -> first morpheme seen with that word
        for morph in morphes:
            kept = merged.get(morph.word)
            if kept is None:
                merged[morph.word] = morph
            else:
                kept.Merge(morph)
        return list(merged.values())

MorphemeSql.py

MorphemeSql.py
0.00MB

#MorphemeSql.py
import pymssql
class MorphemeSql:
    """Data access for the Morpheme table (one row per distinct word)."""

    @staticmethod
    def _connect():
        """Open a new connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded.
        return pymssql.connect("127.0.0.1:1433", "sa", "1234", "BigPro")

    @staticmethod
    def AddMorpheme(mo):
        """Insert mo.word as a new morpheme; database errors are ignored.

        The insert is best-effort, as before: a duplicate word or any other
        database error does not raise. Connection errors still propagate.
        """
        conn = MorphemeSql._connect()
        try:
            cursor = conn.cursor()
            # rcnt is declared NOT NULL without a default, so it has to be
            # supplied or the insert is rejected.
            # NOTE(review): the morpheme's own count is used as the initial
            # value -- confirm the intended meaning of Morpheme.rcnt.
            cursor.execute(
                "Insert into Morpheme(word,rcnt) values(%s,%s)",
                (mo.word, getattr(mo, "ref", 0)))
            conn.commit()
        except pymssql.Error:
            pass  # best-effort insert
        finally:
            conn.close()

    @staticmethod
    def FindMid(word):
        """Return the mid of word, or 0 when the word is not stored."""
        conn = MorphemeSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "select mid from Morpheme where (word=%s)", (word,))
            row = cursor.fetchone()
        finally:
            conn.close()
        if row:
            return row[0]
        return 0

    @staticmethod
    def ListMorpheme():
        """Return a list with every stored word."""
        conn = MorphemeSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select word from Morpheme")
            return [row[0] for row in cursor.fetchall()]
        finally:
            conn.close()

NAWidget.py

NAWidget.py
0.00MB

#NAWidget.py - 뉴스 분석 Widget
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from NewsSql import NewsSql
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
from EHHelper import EHHelper
class NAWidget(QDialog):
    """Dialog listing stored morphemes and the news that contain the selected one."""

    def __init__(self, pa):
        """Build the widgets, wire the selection signal and load the morphemes."""
        super().__init__(pa)
        self.resize(2000,1000)
        lb_mo = QLabel("형태소 목록",self)
        lb_mo.move(20,20)
        self.lbox_mo = QListWidget(self)
        self.lbox_mo.move(20,100)
        self.lbox_mo.resize(400,880)
        lb_news = QLabel("포함 뉴스 목록",self)
        lb_news.move(440,20)
        self.tb_news = QTableWidget(self)
        self.tb_news.move(440,100)
        self.tb_news.resize(1540,880)
        self.tb_news.setColumnCount(4)
        # Integer division: setColumnWidth takes ints, and a float argument
        # is rejected on newer Python versions.
        table_width = self.tb_news.width()
        self.tb_news.setColumnWidth(0,table_width//5)
        self.tb_news.setColumnWidth(1,table_width*2//5)
        self.tb_news.setColumnWidth(2,table_width//6)
        self.tb_news.setColumnWidth(3,table_width//6)
        self.tb_news.setHorizontalHeaderLabels\
            (["제목","링크","전체 개수","참조 개수"])
        self.lbox_mo.currentRowChanged.connect(self.OnSelectNews)
        self.ListMorpheme()

    def ListMorpheme(self):
        """Fill the list box with every morpheme stored in the database."""
        for mo in MorphemeSql.ListMorpheme():
            self.lbox_mo.addItem(EHHelper.MssqlstrToStrKor(mo))

    def OnSelectNews(self):
        """Show the news that contain the currently selected morpheme."""
        self.tb_news.setRowCount(0)  # drop all existing rows
        item = self.lbox_mo.currentItem()
        if item is None:  # nothing is selected
            return
        mid = MorphemeSql.FindMid(item.text())
        ins = InverseSql.FindInv(mid)
        self.tb_news.setRowCount(len(ins))
        for ri, (nid, rcnt) in enumerate(ins):
            news = NewsSql.FindNewsByNid(nid)
            if news is None:  # inverse row without a news row
                continue
            title,link,description,pubdate,mcnt = news
            title = EHHelper.MssqlstrToStrKor(title)
            self.tb_news.setCellWidget(ri,0,QLabel(title))
            self.tb_news.setCellWidget(ri,1,QLabel(link))
            self.tb_news.setCellWidget(ri,2,QLabel(str(mcnt)))
            self.tb_news.setCellWidget(ri,3,QLabel(str(rcnt)))
            self.tb_news.setRowHeight(ri,70)


NCWindow.py

NCWindow.py
0.00MB

#NCWindow.py
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from NewsAnaylizer import NewsAnaylizer
from NewsSql import NewsSql
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
import _thread
from NAWidget import NAWidget
from NSDialog import NSDialog
def Analyze(datas):
    """Store every analyzed news item and its morphemes in the database.

    Each element of datas is a (news, title morphemes, description
    morphemes) triple. The news row is added, its morpheme count is
    updated, and one inverse row is added per morpheme.
    """
    for news, title_moes, desc_moes in datas:
        news.PreProcess()
        NewsSql.AddNews(news)
        NewsSql.UpdateMCnt(news.link, len(title_moes) + len(desc_moes))
        nid = NewsSql.FindNid(news.link)
        # Title morphemes first, then description morphemes.
        for moes in (title_moes, desc_moes):
            for mo in moes:
                MorphemeSql.AddMorpheme(mo)
                mid = MorphemeSql.FindMid(mo.word)
                InverseSql.AddInverseItem(nid, mid, mo.ref)

class NCWindow(QMainWindow):
    """Main window: searches news, lists the results and stores them in the background."""

    def __init__(self):
        """Build the query box, the buttons and the result table."""
        super().__init__()
        self.setWindowTitle("수집기(feat.네이버 개발자 센터)")
        self.na = NewsAnaylizer()
        self.resize(2000,1000)
        self.te_query = QTextEdit(self)
        self.te_query.move(20,20)
        self.te_query.resize(800,60)
        self.btn_search = QPushButton("검색",self)
        self.btn_search.move(840,20)
        self.btn_search.resize(200,60)
        self.tb_news = QTableWidget(self)
        self.tb_news.move(20,100)
        self.tb_news.resize(1960,880)
        self.tb_news.setColumnCount(3)
        # Integer division: setColumnWidth takes ints, and a float argument
        # is rejected on newer Python versions.
        table_width = self.tb_news.width()
        self.tb_news.setColumnWidth(0,table_width*2//10)
        self.tb_news.setColumnWidth(1,table_width*6//10)
        self.tb_news.setColumnWidth(2,table_width*2//10-10)
        self.tb_news.setHorizontalHeaderLabels(["제목","주소","단어 개수"])
        self.btn_search.clicked.connect(self.OnSearch)
        self.btn_analyze = QPushButton("분석", self)
        self.btn_analyze.move(1760,20)
        self.btn_analyze.resize(200,60)
        self.btn_analyze.clicked.connect(self.OnAnalyze)
        self.btn_view_sdlg = QPushButton("검색기 띄우기",self)
        self.btn_view_sdlg.move(1400,20)
        self.btn_view_sdlg.resize(300,60)
        self.btn_view_sdlg.clicked.connect(self.OnShowSearchDlg)

    def OnShowSearchDlg(self):
        """Open the news search dialog."""
        nsd = NSDialog(self)
        nsd.show()

    def OnAnalyze(self):
        """Open the morpheme analysis dialog."""
        naw = NAWidget(self)
        naw.show()

    def OnSearch(self):
        """Search news for the typed query, list them, then store them.

        The table is filled on the calling thread; saving to the database
        is handed to Analyze on a new thread.
        """
        self.tb_news.setRowCount(0)
        query = self.te_query.toPlainText()
        datas = self.na.Analize(query)
        self.tb_news.setRowCount(len(datas))
        for ri, (news, mo1, mo2) in enumerate(datas):
            mcnt = len(mo1)+len(mo2)
            self.tb_news.setCellWidget(ri,0,QLabel(news.title))
            self.tb_news.setCellWidget(ri,1,QLabel(news.link))
            self.tb_news.setCellWidget(ri,2,QLabel(str(mcnt)))
            self.tb_news.setRowHeight(ri,60)
        _thread.start_new_thread(Analyze,(datas,))

News.py

News.py
0.00MB

#News.py
from MorphemeParser import MorphemeParser
from EHHelper import EHHelper
import datetime
class News:
    """A news article: title, link, description and publication time."""

    def __init__(self, title, link, description, pubdate):
        self.title = title
        self.link = link
        self.description = description
        self.pubdate = pubdate

    def PreProcess(self):
        """Strip tags, HTML entities and symbols from title and description."""
        self.title = EHHelper.EmitTagAndSpecialCh(self.title)
        self.description = EHHelper.EmitTagAndSpecialCh(self.description)

    @staticmethod
    def MakeNews(jnews):
        """Build a News from one search-result dictionary.

        Reads the 'title', 'link', 'description' and 'pubDate' keys.
        'pubDate' is parsed as an RFC 2822 date such as
        'Mon, 26 Sep 2016 07:50:00 +0900'. The UTC offset is dropped, so
        pubdate is the naive wall-clock time written in the text. When the
        text cannot be parsed, the current time is used instead.
        """
        # Local import: locale-independent parser for this date format.
        from email.utils import parsedate_to_datetime

        title = jnews['title']
        link = jnews['link']
        description = jnews['description']
        raw_pubdate = jnews['pubDate']
        try:
            parsed = parsedate_to_datetime(raw_pubdate)
        except (TypeError, ValueError, IndexError):
            # Unparseable date: fall back to "now", as before.
            pubdate = datetime.datetime.now()
        else:
            pubdate = parsed.replace(tzinfo=None)
        return News(title, link, description, pubdate)

NewsAnaylizer.py

NewsAnaylizer.py
0.00MB

#NewsAnaylizer.py - 뉴스 분석기
from NewsSearcher import NewsSearcher
from MorphemeParser import MorphemeParser
from Morpheme import Morpheme
from News import News
class NewsAnaylizer:
    """Runs a news search and parses each result into morphemes."""

    def __init__(self):
        self.ns = NewsSearcher()

    def Analize(self, query):
        """Return a list of [news, title morphemes, description morphemes].

        One entry is produced for every item returned by the searcher for
        the given query.
        """
        self.ns.SetQuery(query)
        analyzed = []
        for item in self.ns.RequestAll():
            article = News.MakeNews(item)
            title_moes = MorphemeParser.Parse(article.title)
            desc_moes = MorphemeParser.Parse(article.description)
            analyzed.append([article, title_moes, desc_moes])
        return analyzed

NewsSearcher.py

NewsSearcher.py
0.00MB

#NewsSearcher.py - 뉴스 검색기
from MorphemeParser import MorphemeParser
from Morpheme import Morpheme
import urllib.request
import json
class NewsSearcher:
    """Client for the Naver news search open API."""

    def __init__(self):
        # NOTE(review): API credentials are hard-coded in source; consider
        # moving them to configuration and rotating the secret.
        self.client_id ="N3yLQ95_i8KdsL3tJAJ3"
        self.client_secret="99b7XrrXp3"
        self.url = "https://openapi.naver.com/v1/search/news.json"

    def SetQuery(self, query):
        """Store the URL-quoted query parameter used by later requests."""
        self.qp = "query=" + urllib.parse.quote(query)

    def Request(self, start, display):
        """Fetch one page of results.

        Returns (items, total). On a failed request or a non-200 status it
        returns an empty list and 0.
        """
        query_str = (self.url + "?" + self.qp
                     + "&start=" + str(start)
                     + "&display=" + str(display))
        request = urllib.request.Request(query_str)
        request.add_header("X-Naver-Client-Id",self.client_id)
        request.add_header("X-Naver-Client-Secret",self.client_secret)
        try:
            # The context manager closes the response in every case.
            with urllib.request.urlopen(request) as response:
                if response.getcode() != 200:
                    return list(), 0
                content = response.read()
        except Exception:
            # Best-effort: any request or read failure means "no results".
            return list(), 0
        jdata = json.loads(content.decode('utf-8'))
        total = int(jdata['total'])
        return jdata['items'], total

    def RequestAll(self):
        """Fetch all pages, 100 items at a time, for start positions up to 1000.

        Paging continues while the next start position is within the
        reported total, so the last item is included even when the total is
        one more than a multiple of the page size.
        """
        display = 100
        redatas = list()
        start = 1
        datas, total = self.Request(start, display)
        redatas.extend(datas)
        start += display
        while start <= total and start <= 1000:
            datas, total = self.Request(start, display)
            redatas.extend(datas)
            start += display
        return redatas

NewsSql.py

NewsSql.py
0.00MB

#NewsSql.py
from News import News
import pymssql
class NewsSql:
    """Data access for the News table."""

    @staticmethod
    def _connect():
        """Open a new connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded.
        return pymssql.connect("127.0.0.1:1433", "sa", "1234", "BigPro")

    @staticmethod
    def AddNews(news):
        """Insert a news row with mcnt 0; database errors are ignored.

        The insert is best-effort, as before: a duplicate link or any other
        database error does not raise. Connection errors still propagate.
        """
        dstr = news.pubdate.strftime('%Y-%m-%d %H:%M:%S')
        conn = NewsSql._connect()
        try:
            cursor = conn.cursor()
            # Parameterized statement: the link is external data and may
            # contain quote characters.
            cursor.execute(
                "insert into News (title,description,link,pubdate,mcnt) "
                "values(%s,%s,%s,%s,%s)",
                (news.title, news.description, news.link, dstr, 0))
            conn.commit()
        except pymssql.Error:
            pass  # best-effort insert
        finally:
            conn.close()

    @staticmethod
    def UpdateMCnt(link, mcnt):
        """Set the morpheme count of the news row identified by link."""
        conn = NewsSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "update News set mcnt=%s where (link=%s)", (mcnt, link))
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def FindNid(link):
        """Return the nid of the news with this link, or None when absent."""
        conn = NewsSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "select nid from News where (link=%s)", (link,))
            row = cursor.fetchone()
        finally:
            conn.close()
        # A missing row no longer fails with a TypeError on row[0].
        return row[0] if row else None

    @staticmethod
    def FindNewsByNid(nid):
        """Return (title, link, description, pubdate, mcnt), or None when absent."""
        conn = NewsSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "select title,link,description,pubdate,mcnt from News "
                "where (nid=%s)", (nid,))
            return cursor.fetchone()
        finally:
            conn.close()

    @staticmethod
    def TotalDocumentCount():
        """Return the number of rows in the News table."""
        conn = NewsSql._connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select count(*) from News")
            return cursor.fetchone()[0]
        finally:
            conn.close()

NSDialog.py

NSDialog.py
0.00MB

#NSDialog.py - 뉴스 검색 대화상자
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from NewsSql import NewsSql
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
from EHHelper import EHHelper
from MorphemeParser import MorphemeParser
from ScoredNews import ScoredNews
from News import News
import math
class NSDialog(QDialog):
    """News search dialog: ranks stored news against the words of a query."""

    def __init__(self, pa):
        """Build the query box, the result list and the read-only detail fields."""
        super().__init__(pa)
        self.resize(2000,1000)
        self.setWindowTitle("뉴스 검색기")
        self.te_query = QTextEdit(self)
        self.te_query.move(20,20)
        self.te_query.resize(800,60)
        self.btn_search = QPushButton("검색",self)
        self.btn_search.move(840,20)
        self.btn_search.resize(200,60)
        self.lbox_news = QListWidget(self)
        self.lbox_news.move(20,90)
        self.lbox_news.resize(600,900)
        lb_d1 = QLabel("제목:",self)
        lb_d1.move(640,100)
        lb_d2 = QLabel("링크",self)
        lb_d2.move(640,180)
        lb_d3 = QLabel("점수:",self)
        lb_d3.move(640,260)
        self.te_title = QTextEdit("",self)
        self.te_title.setReadOnly(True)
        self.te_title.move(740,100)
        self.te_title.resize(1240,60)
        self.te_link = QTextEdit("",self)
        self.te_link.setReadOnly(True)
        self.te_link.move(740,180)
        self.te_link.resize(1240,60)
        self.te_score = QTextEdit("",self)
        self.te_score.setReadOnly(True)
        self.te_score.move(740,260)
        self.te_score.resize(1240,60)
        self.te_description = QTextEdit("",self)
        self.te_description.setReadOnly(True)
        self.te_description.move(640,340)
        self.te_description.resize(1340,650)
        self.btn_search.clicked.connect(self.OnSearch)
        self.lbox_news.currentRowChanged.connect(self.OnSelectChange)
        self.sns = list()  # scored news, kept in the same order as lbox_news

    def OnSearch(self):
        """Score the stored news against the query and list them by score.

        For every query word, each news containing it receives
        (rcnt / mcnt) * log(total documents / (matching documents + 1)).
        Scores of the same news are summed across words.
        """
        self.sns.clear()
        # Drop the previous result rows so list rows stay aligned with sns.
        self.lbox_news.clear()
        query = self.te_query.toPlainText()
        tdcnt = NewsSql.TotalDocumentCount()
        for mo in MorphemeParser.Parse(query):
            mid = MorphemeSql.FindMid(mo.word)
            ins = InverseSql.FindInv(mid)
            if not ins:  # word not present in any stored news
                continue
            # Same for every news of this word, so computed once.
            log_idf = math.log(tdcnt/(len(ins)+1))
            for nid, rcnt in ins:
                news = NewsSql.FindNewsByNid(nid)
                if news is None:  # inverse row without a news row
                    continue
                title,link,description,pubdate,mcnt = news
                if not mcnt:  # avoid dividing by a zero morpheme count
                    continue
                title = EHHelper.MssqlstrToStrKor(title)
                description = EHHelper.MssqlstrToStrKor(description)
                score = (rcnt/mcnt)*log_idf
                sn = ScoredNews(News(title,link,description,pubdate),score)
                self.sns.append(sn)
        self.sns = NSDialog.MergeDupSns(self.sns)
        for sn in self.sns:
            self.lbox_news.addItem(sn.news.title)

    def OnSelectChange(self):
        """Show the details of the currently selected result."""
        index = self.lbox_news.currentRow()
        # Row is -1 when nothing is selected (e.g. right after a clear).
        if index < 0 or index >= len(self.sns):
            return
        sn = self.sns[index]
        self.te_title.setText(sn.news.title)
        self.te_link.setText(sn.news.link)
        self.te_description.setText(sn.news.description)
        self.te_score.setText(str(sn.score))

    @staticmethod
    def MergeDupSns(sns):
        """Merge entries that share a link by summing scores, then sort them.

        The first entry seen for a link is kept and receives the scores of
        later duplicates. The result is ordered by the entries' own
        comparison.
        """
        merged = {}  # link -> first scored news seen with that link
        for sn in sns:
            kept = merged.get(sn.news.link)
            if kept is None:
                merged[sn.news.link] = sn
            else:
                kept.score += sn.score
        return sorted(merged.values())

ScoredNews.py

ScoredNews.py
0.00MB

#ScoredNews.py
from News import News
class ScoredNews:
    """Pairs a news item with its relevance score."""

    def __init__(self, news, score):
        self.news, self.score = news, score

    def __lt__(self, other):
        """Order higher scores first, so a plain sort is descending by score."""
        return other.score < self.score

Main.py

import sys
from PyQt5.QtWidgets import QApplication
from NCWindow import NCWindow
# Create the Qt application from the command-line arguments.
app = QApplication(sys.argv)
# Build and show the collector main window.
ncw = NCWindow()
ncw.show()
# Run the event loop and use its return value as the process exit code.
sys.exit(app.exec_())

 

반응형