언어 자료구조 알고리즘/파이썬(Python)

[python] 뉴스 검색기V03 feat. 네이버 개발자센터, 형태소 분석, MSSQL

언제나휴일 2020. 11. 11. 09:23
반응형

뉴스 테이블

-- News articles collected from the search API.
-- nid     : surrogate key generated by IDENTITY
-- link    : article URL; the TU constraint keeps it unique so an article is stored once
-- pubdate : publication time of the article
-- mcnt    : morpheme count of the article (title morphemes + description morphemes)
CREATE TABLE [dbo].News
(
    [nid] INT NOT NULL PRIMARY KEY IDENTITY, 
    [title] VARCHAR(200) NOT NULL, 
    [link] VARCHAR(200) NOT NULL, 
    [description] VARCHAR(MAX) NOT NULL, 
    [pubdate] DATETIME NOT NULL,
    [mcnt] int NOT NULL,
    -- table-level constraint: must be separated from the last column by a comma
    CONSTRAINT TU UNIQUE(link)
)

형태소 테이블

-- Dictionary of distinct morphemes (words) seen in the collected news.
-- mid  : surrogate key generated by IDENTITY
-- word : the morpheme text; the WU constraint keeps every word unique
CREATE TABLE [dbo].Morpheme
(
    [mid] INT NOT NULL PRIMARY KEY IDENTITY, 
    [word] VARCHAR(50) NOT NULL, 
    CONSTRAINT WU UNIQUE(word)
)

역참조 테이블(역파일)

-- Inverted index: one row per (news article, morpheme) pairing.
-- nid  : News.nid of the article
-- mid  : Morpheme.mid of the word
-- rcnt : reference count stored for that word in that article
-- NOTE(review): no key or foreign-key constraint is declared here; the
-- relationships to News and Morpheme are added separately.
CREATE TABLE [dbo].Inverse
(
    [nid] INT NOT NULL, 
    [mid] INT NOT NULL, 
    [rcnt] INT NOT NULL 
)

관계 추가

News 테이블의 nid 컬럼과 Inverse 테이블의 nid 컬럼을 외래 키 관계로 연결합니다.
Morpheme 테이블의 mid 컬럼과 Inverse 테이블의 mid 컬럼을 외래 키 관계로 연결합니다.

News.py

News.py
0.00MB

#News.py
from MorphemeParser import MorphemeParser
from EHHelper import EHHelper
import datetime
class News:
    """One news article as returned by the news search API.

    Attributes:
        title: article headline.
        link: article URL.
        description: article summary text.
        pubdate: publication time; ``MakeNews`` stores a naive
            ``datetime.datetime`` here.
    """
    def __init__(self,title,link,description,pubdate):
        self.title = title
        self.link = link
        self.description = description
        self.pubdate = pubdate
    def PreProcess(self):
        """Clean ``title`` and ``description`` in place.

        Both fields are passed through ``EHHelper.EmitTagAndSpecialCh``
        (defined in another module).
        """
        self.title = EHHelper.EmitTagAndSpecialCh(self.title)
        self.description = EHHelper.EmitTagAndSpecialCh(self.description)
    @staticmethod
    def MakeNews(jnews):
        """Build a ``News`` from one JSON item of the search response.

        Args:
            jnews: mapping with the keys ``title``, ``link``,
                ``description`` and ``pubDate``.  ``pubDate`` is expected in
                the form ``"Wed, 11 Nov 2020 09:23:00 +0900"``.

        Returns:
            A ``News`` whose ``pubdate`` is the parsed wall-clock time with
            the UTC offset dropped, or the current local time when the value
            cannot be parsed.

        Raises:
            KeyError: if one of the required keys is missing.
        """
        title = jnews['title']
        link = jnews['link']
        description = jnews['description']
        raw_pubdate = jnews['pubDate']
        try:
            # Keep the first five tokens ("Wed," "11" "Nov" "2020" "09:23:00")
            # so that a trailing "+0900" / "-0500" offset is discarded whatever
            # its sign, and a value without an offset is left intact.
            stamp = " ".join(raw_pubdate.split()[:5])
            pubdate = datetime.datetime.strptime(stamp,"%a, %d %b %Y %H:%M:%S")
        except (AttributeError, TypeError, ValueError):
            # Unparseable or non-string date: fall back to "now" (best effort).
            pubdate = datetime.datetime.now()
        return News(title,link,description,pubdate)

 

Morpheme.py

Morpheme.py
0.00MB

 

 

#Morpheme.py  - 형태소 클래스
class Morpheme:
    """A word paired with the number of times it has been referenced."""
    def __init__(self,word):
        self.word = word  # the word this morpheme represents
        self.ref = 1      # reference count, starts at one occurrence
    def IsEqual(self,other):
        """Return True when ``other`` carries the same word as this morpheme."""
        return self.word == other.word
    def Merge(self,other):
        """Add ``other``'s reference count to this one if both share a word."""
        if not self.IsEqual(other):
            return
        self.ref += other.ref

NewsSql.py

NewsSql.py
0.00MB

#NewsSql.py
from News import News
import pymssql
class NewsSql:
    """Data-access helpers for the News table (SQL Server through pymssql).

    Every method opens its own connection and always closes it, even when a
    statement fails.  All values are sent as bound parameters, so text that
    contains quotes cannot break or alter the statement.
    """
    @staticmethod
    def _Connect():
        """Open a connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded.
        return pymssql.connect("127.0.0.1:1433", "sa","1234","BigPro")
    @staticmethod
    def AddNews(news):
        """Insert ``news`` with ``mcnt`` set to 0 (best effort).

        Database errors (for example a duplicate ``link``) are swallowed on
        purpose so that re-collecting an article does not stop the run.

        Args:
            news: object exposing ``title``, ``description``, ``link`` and a
                ``pubdate`` that supports ``strftime``.
        """
        dstr = news.pubdate.strftime('%Y-%m-%d %H:%M:%S')
        query = "insert into News (title,description,link,pubdate,mcnt) "\
                "values(%s,%s,%s,%s,%s)"
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query,(news.title,news.description,news.link,dstr,0))
                conn.commit()
            except pymssql.Error:
                # Deliberate best effort: discard the failed insert.
                conn.rollback()
        finally:
            conn.close()
    @staticmethod
    def UpdateMCnt(link,mcnt):
        """Set the morpheme count of the article identified by ``link``."""
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("update News set mcnt=%s where (link=%s)",(mcnt,link))
            conn.commit()
        finally:
            conn.close()
    @staticmethod
    def FindNid(link):
        """Return the ``nid`` of the article with ``link``, or None if absent."""
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select nid from News where (link=%s)",(link,))
            row = cursor.fetchone()
        finally:
            conn.close()
        # No matching row: report it as None instead of failing on row[0].
        return row[0] if row else None
    @staticmethod
    def FindNewsByNid(nid):
        """Return the row (title, link, description, pubdate, mcnt) for ``nid``.

        Returns None when no article has that id.
        """
        query = "select title,link,description,pubdate,mcnt from News "\
                "where (nid=%s)"
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute(query,(nid,))
            return cursor.fetchone()
        finally:
            conn.close()
    @staticmethod
    def TotalDocumentCount():
        """Return the number of rows in the News table."""
        conn = NewsSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select count(*) from News")
            return cursor.fetchone()[0]
        finally:
            conn.close()

MorphemeSql.py

MorphemeSql.py
0.00MB

#MorphemeSql.py
import pymssql
class MorphemeSql:
    """Data-access helpers for the Morpheme table (SQL Server through pymssql).

    Each method opens its own connection and always closes it.  Words are sent
    as bound parameters rather than being pasted into the SQL text.
    """
    @staticmethod
    def _Connect():
        """Open a connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded.
        return pymssql.connect("127.0.0.1:1433", "sa","1234","BigPro")
    @staticmethod
    def AddMorpheme(mo):
        """Insert ``mo.word`` into Morpheme (best effort).

        Database errors (for example the word already exists) are swallowed
        on purpose so that known words do not stop the run.
        """
        conn = MorphemeSql._Connect()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("Insert into Morpheme(word) values(%s)",(mo.word,))
                conn.commit()
            except pymssql.Error:
                # Deliberate best effort: discard the failed insert.
                conn.rollback()
        finally:
            conn.close()
    @staticmethod
    def FindMid(word):
        """Return the ``mid`` of ``word``, or 0 when the word is not stored."""
        conn = MorphemeSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select mid from Morpheme where (word=%s)",(word,))
            row = cursor.fetchone()
        finally:
            conn.close()
        if row:
            return row[0]
        return 0
    @staticmethod
    def ListMorpheme():
        """Return every stored word as a list of strings."""
        conn = MorphemeSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("select word from Morpheme")
            return [row[0] for row in cursor.fetchall()]
        finally:
            conn.close()

 

InverseSql.py

InverseSql.py
0.00MB

 

#InverseSql.py
import pymssql
class InverseSql:
    """Data-access helpers for the Inverse (inverted index) table.

    Each method opens its own connection and always closes it.  Values are
    sent as bound parameters rather than being pasted into the SQL text.
    """
    @staticmethod
    def _Connect():
        """Open a connection to the BigPro database."""
        # NOTE(review): server address and credentials are hard-coded.
        return pymssql.connect("127.0.0.1:1433", "sa","1234","BigPro")
    @staticmethod
    def AddInverseItem(nid,mid,rcnt):
        """Insert one (article id, morpheme id, reference count) row.

        Database errors are swallowed on purpose (best effort).
        """
        conn = InverseSql._Connect()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("Insert into Inverse(nid,mid,rcnt) values(%s,%s,%s)",
                               (nid,mid,rcnt))
                conn.commit()
            except pymssql.Error:
                # Deliberate best effort: discard the failed insert.
                conn.rollback()
        finally:
            conn.close()
    @staticmethod
    def FindInv(mid):
        """Return the list of (nid, rcnt) rows stored for morpheme ``mid``."""
        conn = InverseSql._Connect()
        try:
            cursor = conn.cursor()
            cursor.execute("Select nid,rcnt from Inverse where mid=%s",(mid,))
            return list(cursor.fetchall())
        finally:
            conn.close()

NewsSearcher.py

NewsSearcher.py
0.00MB

 

 

#NewsSearcher.py - 뉴스 검색기
from MorphemeParser import MorphemeParser
from Morpheme import Morpheme
import urllib.request
import json
class NewsSearcher:
    """Client for the Naver news search Open API.

    Call ``SetQuery`` first, then ``Request`` for one page or ``RequestAll``
    for every page the loop is allowed to fetch.
    """
    def __init__(self):
        self.client_id ="네이버에서 제공한 클라이언트 ID"
        self.client_secret="네이버에서 제공한 클라이언트 Secret"
        self.url = "https://openapi.naver.com/v1/search/news.json"
        # Defined up front so Request() cannot hit a missing attribute when
        # SetQuery() has not been called yet.
        self.qp = "query="
    def SetQuery(self,query):
        """Store the URL-encoded search term used by later requests."""
        import urllib.parse  # explicit: quote() lives in urllib.parse
        query = urllib.parse.quote(query)
        self.qp = "query="+query
    def Request(self,start,display):
        """Fetch one page of results.

        Args:
            start: 1-based index of the first item to return.
            display: number of items to return.

        Returns:
            ``(items, total)`` where ``items`` is the list under the
            response's ``items`` key and ``total`` its ``total`` value as an
            int.  ``([], 0)`` when the request fails or the status is not 200.
        """
        sp = "start="+str(start)
        dp = "display="+str(display)
        query_str = self.url+"?"+self.qp+"&"+sp+"&"+dp
        request = urllib.request.Request(query_str)
        request.add_header("X-Naver-Client-Id",self.client_id)
        request.add_header("X-Naver-Client-Secret",self.client_secret)
        try:
            response = urllib.request.urlopen(request)
        except Exception:
            # Best-effort boundary: any failure to open the URL yields an
            # empty page, but KeyboardInterrupt/SystemExit still propagate.
            return list(),0
        with response:  # make sure the HTTP response is closed
            if response.getcode()!=200:
                return list(),0
            content = response.read()
        content = content.decode('utf-8')
        jdata = json.loads(content)
        total = int(jdata['total'])
        return jdata['items'],total
    def RequestAll(self):
        """Fetch consecutive pages of 100 items and return them as one list.

        Paging stops once the next start index is past the reported total,
        reaches 1000, or a page comes back empty.
        """
        start = 1
        display = 100
        redatas = list()
        while True:
            datas, total = self.Request(start,display)
            redatas.extend(datas)
            start = start + display
            # start is 1-based, so the item at index == total must still be
            # fetched: stop only when start has moved past total.
            if not datas or start>total or start>=1000:
                break
        return redatas

MorphemeParser.py

 

MorphemeParser.py
0.00MB

#MorphemeParser.py - 형태소 분석기
from Morpheme import Morpheme
from EHHelper import EHHelper
class MorphemeParser:
    """Splits text into morphemes (alphabetic, space-separated words)."""
    @staticmethod
    def Parse(src):
        """Return the distinct morphemes of ``src`` with their counts.

        ``src`` is cleaned with ``EHHelper.EmitTagAndSpecialCh`` (defined in
        another module), split on single spaces, and every token made up only
        of alphabetic characters becomes a ``Morpheme``.  Duplicates are then
        merged so each word appears once with its summed reference count.
        """
        src = EHHelper.EmitTagAndSpecialCh(src)
        # Empty tokens produced by consecutive spaces fail isalpha() and drop out.
        morphes = [Morpheme(elem) for elem in src.split(' ') if elem.isalpha()]
        return MorphemeParser.Merge(morphes)

    @staticmethod
    def Merge(morphes):
        """Collapse morphemes that share a word into a single entry.

        The first occurrence of each word is kept (and mutated: later
        occurrences are folded into it through its ``Merge`` method).  The
        order of first occurrences is preserved.

        Args:
            morphes: iterable of objects with a ``word`` attribute and a
                ``Merge(other)`` method.

        Returns:
            A new list holding one morpheme per distinct word.
        """
        # Dict lookup replaces the linear scan of the result list; dicts keep
        # insertion order, so the output order is unchanged.
        by_word = dict()
        for morph in morphes:
            kept = by_word.get(morph.word)
            if kept is None:
                by_word[morph.word] = morph
            else:
                kept.Merge(morph)
        return list(by_word.values())

NewsAnaylizer.py

NewsAnaylizer.py
0.00MB

 

 

#NewsAnaylizer.py - 뉴스 분석기
from NewsSearcher import NewsSearcher
from MorphemeParser import MorphemeParser
from Morpheme import Morpheme
from News import News
class NewsAnaylizer:
    """Searches news for a query and breaks each article into morphemes."""
    def __init__(self):
        self.ns = NewsSearcher()
    def Analize(self, query):
        """Return ``[news, title_morphemes, description_morphemes]`` per article.

        Only the first page of ten results is requested; the total reported
        by the search is not used.
        """
        self.ns.SetQuery(query)
        items, _total = self.ns.Request(1,10)
        analysed = list()
        for item in items:
            article = News.MakeNews(item)
            title_morphes = MorphemeParser.Parse(article.title)
            body_morphes = MorphemeParser.Parse(article.description)
            analysed.append([article,title_morphes,body_morphes])
        return analysed

 

Main.py

from NewsAnaylizer import NewsAnaylizer
from NewsSql import NewsSql
from MorphemeParser import MorphemeParser
from News import News
from MorphemeSql import MorphemeSql
from InverseSql import InverseSql
# Collect news for the query typed by the user and store the articles,
# their morphemes and the inverted-index rows in the database.
na = NewsAnaylizer()
q = input('질의:')
adatas = na.Analize(q)
for news, m1, m2 in adatas:
    news.PreProcess()
    NewsSql.AddNews(news)
    # mcnt = distinct morphemes in the title + distinct morphemes in the description
    NewsSql.UpdateMCnt(news.link,len(m1)+len(m2))
    nid = NewsSql.FindNid(news.link)
    # Title morphemes first, then description morphemes, handled identically.
    for morphes in (m1, m2):
        for mo in morphes:
            MorphemeSql.AddMorpheme(mo)
            mid = MorphemeSql.FindMid(mo.word)
            InverseSql.AddInverseItem(nid,mid,mo.ref)
반응형