Source code for unipy_nlp.tagger

# -*- coding: utf-8 -*-
from __future__ import absolute_import


import os
import re
import io
import json
import sys
import subprocess
import importlib
import pandas as pd

from MeCab import Tagger

from ._backend import (build_mecab, reset_mecabrc,
                       PKG_PATH,
                       MECAB_INSTALLED_DIC_PATH,
                       MECAB_SOURCE_DIC_PATH)

from . import __BUILD_OK__
import sys


__all__ = []
__all__ += [
    'Mecab',
    'build_mecab_user_dic',
]


attrs = ['tags',        # 품사 태그
         'semantic',    # 의미 부류
         'has_jongsung',  # 종성 유무
         'read',        # 읽기
         'type',        # 타입
         'first_pos',   # 첫번째 품사
         'last_pos',    # 마지막 품사
         'original',    # 원형
         'indexed']     # 인덱스 표현

module_installed_path = os.path.dirname(os.path.relpath(__file__))
# dic_installed_path = os.path.join(
#     module_installed_path,
#     '_resources/mecab/mecab/lib/mecab/dic/mecab-ko-dic',
# )
# dic_source_path=os.path.join(
#     module_installed_path,
#     '_resources/mecab/mecab-ko-dic',
# )
# dicpath_to_modulepath = os.path.relpath(
#     module_installed_path,
#     dic_source_path,
# )
module_installed_path = PKG_PATH
dic_installed_path = MECAB_INSTALLED_DIC_PATH
dic_source_path = MECAB_SOURCE_DIC_PATH

def read_json(filename, encoding='utf-8'):
    """JSON file reader."""
    with io.open(filename, 'r', encoding=encoding) as f:
        return json.load(f)


def parse(result, allattrs=False, join=False):
    def split(elem, join=False):
        if not elem:
            return ('', 'SY')
        s, t = elem.split('\t')

        if join:
            return s + '/' + t.split(',', 1)[0]
        else:
            return (s, t.split(',', 1)[0])

    return [split(elem, join=join) for elem in result.splitlines()[:-1]]


[docs]class Mecab(): """Wrapper for MeCab-ko morphological analyzer. `MeCab`_, originally a Japanese morphological analyzer and POS tagger developed by the Graduate School of Informatics in Kyoto University, was modified to MeCab-ko by the `Eunjeon Project`_ to adapt to the Korean language. In order to use MeCab-ko within KoNLPy, follow the directions in :ref:`optional-installations`. .. code-block:: python :emphasize-lines: 1 >>> from unipy_nlp.tagger import Mecab >>> mecab = Mecab() >>> print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.')) ['영등포구', '청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.'] >>> print(mecab.nouns(u'우리나라에는 무릎 치료를 잘하는 정형외과가 없는가!')) ['우리', '나라', '무릎', '치료', '정형외과'] >>> print(mecab.pos(u'자연주의 쇼핑몰은 어떤 곳인가?')) [('자연', 'NNG'), ('주', 'NNG'), ('의', 'JKG'), ('쇼핑몰', 'NNG'), ('은', 'JX'), ('어떤', 'MM'), ('곳', 'NNG'), ('인가', 'VCP+EF'), ('?', 'SF')] :param dicpath: The path of the MeCab-ko dictionary. .. _MeCab: https://taku910.github.io/mecab/ .. _Eunjeon Project: http://eunjeon.blogspot.kr/ """ def __init__( self, dicpath=dic_installed_path, ): try: self.tagger = Tagger('-d %s' % dicpath) self.tagset = read_json( '%s/_resources/mecab/mecab_tagset.json' % module_installed_path ) except RuntimeError: raise Exception('The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"' % dicpath) # except NameError: # raise Exception('Install MeCab in order to use it: http://konlpy.org/en/latest/install/') # TODO: check whether flattened results equal non-flattened
[docs] def pos(self, phrase, flatten=True, join=False): """POS tagger. :param flatten: If False, preserves eojeols. :param join: If True, returns joined sets of morph and tag. """ if sys.version_info[0] < 3: phrase = phrase.encode('utf-8') if flatten: result = self.tagger.parse(phrase).decode('utf-8') return parse(result, join=join) else: return [ parse( self.tagger.parse(eojeol).decode('utf-8'), join=join, ) for eojeol in phrase.split() ] else: if flatten: result = self.tagger.parse(phrase) return parse(result, join=join) else: return [ parse( self.tagger.parse(eojeol).decode('utf-8'), join=join, ) for eojeol in phrase.split() ]
[docs] def morphs(self, phrase): """Parse phrase to morphemes.""" return [s for s, t in self.pos(phrase)]
[docs] def nouns(self, phrase): """Noun extractor.""" tagged = self.pos(phrase) return [s for s, t in tagged if t.startswith('N')]
[docs]def build_mecab_user_dic(nested_list, mode='a'): # sample_list = [ # ['점심시간', 'T'], # ['워라밸', 'T'], # ['의사 결정', 'T'], # ] udf_token = pd.DataFrame( nested_list, columns=['word', 'last_yn'], ).drop_duplicates(subset='word') udf_token['0'] = udf_token['word'] udf_token['1'] = 0 udf_token['2'] = 0 udf_token['3'] = 1 udf_token['4'] = 'NNG' udf_token['5'] = '*' udf_token['6'] = udf_token['last_yn'] udf_token['7'] = udf_token['word'] udf_token['8'] = '*' udf_token['9'] = '*' udf_token['10'] = '*' udf_token['11'] = '*' udf_token['12'] = '*' udf_token_mecab = udf_token.loc[:, udf_token.columns.str.isnumeric()] """ 0 1 2 3 4 5 6 7 8 9 10 11 12 표층형 (표현형태) 좌문맥ID 우문맥ID 출현비용 품사태그 의미부류 종성 유무 읽기 타입 첫번째품사 마지막 품사 서울 0 0 0 NNG 지명 T 서울 * * * * * 불태워졌 0 0 0 VV+EM+VX+EP * T 불태워졌 inflected VV EP * 불태우/VV/+어/EC/+지/VX/+었/EP/ 해수욕장 0 0 0 NNG * T 해수욕장 Compound * * * 해수/NNG/+욕/NNG/+장/NNG/* """ udf_token_mecab.to_csv( os.path.join( dic_source_path, 'user-dic', 'udf.csv', ), mode=mode, header=False, index=False, ) # commands = "cd {MEACB_DIC_DIR} ;bash ./tools/add-userdic.sh ;cd {PKG_DIR}".format( # MEACB_DIC_DIR=dicpath_to_modulepath, # PKG_DIR=module_installed_path, # ) importlib.reload(__BUILD_OK__) if not __BUILD_OK__.__BUILD_OK__: print("RE-BUILD 'MeCab' at the first time to add 'user-defined dictionary'.") build_mecab() importlib.reload(__BUILD_OK__) commands = ";".join( [ "cd {MEACB_DIC_DIR}", "cp ../add-userdic.sh ./tools", "bash ./tools/add-userdic.sh", ] ).format( MEACB_DIC_DIR=dic_source_path, ) out = subprocess.check_output(commands, shell=True) print(out.decode('utf-8')) reset_mecabrc()