# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import re
import io
import json
import sys
import subprocess
import importlib
import pandas as pd
from MeCab import Tagger
from ._backend import (build_mecab, reset_mecabrc,
PKG_PATH,
MECAB_INSTALLED_DIC_PATH,
MECAB_SOURCE_DIC_PATH)
from . import __BUILD_OK__
import sys
# Names exported by ``from <this module> import *``.
__all__ = [
    'Mecab',
    'build_mecab_user_dic',
]

# Feature names with their meaning.
# NOTE(review): these look like the per-morpheme feature fields of a
# mecab-ko-dic entry (the columns after the cost); the list is not read
# anywhere in this part of the file -- confirm before relying on it.
attrs = [
    'tags',          # POS tag
    'semantic',      # semantic class
    'has_jongsung',  # whether a final consonant (jongsung) is present
    'read',          # reading
    'type',          # type
    'first_pos',     # first POS
    'last_pos',      # last POS
    'original',      # original form
    'indexed',       # indexed expression
]

# All locations come from ``._backend``.  The previous
# ``os.path.dirname(os.path.relpath(__file__))`` assignment was dropped: its
# result was overwritten immediately, it depended on the current working
# directory, and ``os.path.relpath`` can raise ``ValueError`` on Windows when
# the module and the working directory are on different drives.
module_installed_path = PKG_PATH
dic_installed_path = MECAB_INSTALLED_DIC_PATH
dic_source_path = MECAB_SOURCE_DIC_PATH
def read_json(filename, encoding='utf-8'):
    """Read ``filename`` as text in ``encoding`` and return the parsed JSON."""
    with io.open(filename, mode='r', encoding=encoding) as handle:
        text = handle.read()
    return json.loads(text)
def parse(result, allattrs=False, join=False):
    """Turn raw tagger output into a list of (surface, tag) items.

    Each line of ``result`` except the last one is expected to look like
    ``surface<TAB>tag,feature,...``; only the first comma-separated feature
    (the tag) is kept.

    :param result: Text output of the tagger; its final line is discarded.
    :param allattrs: Accepted for compatibility; not used.
    :param join: If True, items are ``'surface/tag'`` strings instead of
        ``(surface, tag)`` tuples.
    :return: One item per line.  An empty line always yields the tuple
        ``('', 'SY')``, regardless of ``join``.
    """
    def _convert(line):
        if line:
            surface, features = line.split('\t')
            tag = features.split(',', 1)[0]
            if join:
                return surface + '/' + tag
            return (surface, tag)
        return ('', 'SY')

    body = result.splitlines()[:-1]
    return list(map(_convert, body))
class Mecab():
    """Wrapper for MeCab-ko morphological analyzer.

    `MeCab`_, originally a Japanese morphological analyzer and POS tagger
    developed by the Graduate School of Informatics in Kyoto University,
    was modified to MeCab-ko by the `Eunjeon Project`_
    to adapt to the Korean language.

    In order to use MeCab-ko within KoNLPy, follow the directions in
    :ref:`optional-installations`.

    .. code-block:: python
        :emphasize-lines: 1

        >>> from unipy_nlp.tagger import Mecab
        >>> mecab = Mecab()
        >>> print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))
        ['영등포구', '청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']
        >>> print(mecab.nouns(u'우리나라에는 무릎 치료를 잘하는 정형외과가 없는가!'))
        ['우리', '나라', '무릎', '치료', '정형외과']
        >>> print(mecab.pos(u'자연주의 쇼핑몰은 어떤 곳인가?'))
        [('자연', 'NNG'), ('주', 'NNG'), ('의', 'JKG'), ('쇼핑몰', 'NNG'), ('은', 'JX'), ('어떤', 'MM'), ('곳', 'NNG'), ('인가', 'VCP+EF'), ('?', 'SF')]

    :param dicpath: The path of the MeCab-ko dictionary.

    .. _MeCab: https://taku910.github.io/mecab/
    .. _Eunjeon Project: http://eunjeon.blogspot.kr/
    """

    def __init__(
            self,
            dicpath=dic_installed_path,
    ):
        """Create the tagger for ``dicpath`` and load the tag set.

        :param dicpath: Dictionary directory handed to MeCab via ``-d``.
        :raises Exception: If creating the ``Tagger`` raises
            ``RuntimeError``.
        """
        try:
            self.tagger = Tagger('-d %s' % dicpath)
        except RuntimeError:
            raise Exception('The MeCab dictionary does not exist at "%s". Is the dictionary correctly installed?\nYou can also try entering the dictionary path when initializing the Mecab class: "Mecab(\'/some/dic/path\')"' % dicpath)
        # Loaded after the tagger so a missing dictionary is reported first.
        self.tagset = read_json(
            '%s/_resources/mecab/mecab_tagset.json' % module_installed_path
        )

    # TODO: check whether flattened results equal non-flattened
    def pos(self, phrase, flatten=True, join=False):
        """POS tagger.

        :param phrase: Text to analyse.
        :param flatten: If False, preserves eojeols: the phrase is split on
            whitespace and one result list is returned per piece.
        :param join: If True, returns joined sets of morph and tag.
        :return: The ``parse`` result for the whole phrase when ``flatten``
            is true, otherwise a list with one ``parse`` result per eojeol.
        """
        is_py2 = sys.version_info[0] < 3
        if is_py2:
            phrase = phrase.encode('utf-8')

        def _tag(text):
            # The Python 2 branch feeds the tagger UTF-8 bytes and decodes
            # its output.  On Python 3 the output is used as text directly;
            # calling ``.decode`` on it (as the non-flattened path used to)
            # raises AttributeError.
            raw = self.tagger.parse(text)
            return raw.decode('utf-8') if is_py2 else raw

        if flatten:
            return parse(_tag(phrase), join=join)
        return [parse(_tag(eojeol), join=join) for eojeol in phrase.split()]

    def morphs(self, phrase):
        """Parse phrase to morphemes."""
        return [s for s, t in self.pos(phrase)]

    def nouns(self, phrase):
        """Noun extractor: morphemes whose tag starts with ``'N'``."""
        tagged = self.pos(phrase)
        return [s for s, t in tagged if t.startswith('N')]
def build_mecab_user_dic(nested_list, mode='a'):
    """Add user-defined words to the MeCab user dictionary and rebuild it.

    Every word is written as an ``NNG`` entry to ``user-dic/udf.csv`` under
    the dictionary source directory.  If the ``__BUILD_OK__`` flag module
    reports that MeCab has not been built yet, ``build_mecab()`` is run
    first.  Then ``tools/add-userdic.sh`` is executed inside the dictionary
    source directory and ``reset_mecabrc()`` is called.

    :param nested_list: Rows of ``[word, last_yn]``, for example
        ``[['점심시간', 'T'], ['워라밸', 'T'], ['의사 결정', 'T']]``.
        ``last_yn`` is written to the final-consonant column.  Rows whose
        ``word`` repeats an earlier row are dropped.
    :param mode: File mode passed to ``DataFrame.to_csv``; the default
        ``'a'`` appends to an existing ``udf.csv``.
    :raises subprocess.CalledProcessError: If the shell command finishes
        with a non-zero exit status.
    """
    udf_token = pd.DataFrame(
        nested_list,
        columns=['word', 'last_yn'],
    ).drop_duplicates(subset='word')

    # Dictionary CSV layout (columns 0-12):
    #   0 surface form        1 left context ID    2 right context ID
    #   3 cost                4 POS tag            5 semantic class
    #   6 final consonant     7 reading            8 type
    #   9 first POS          10 last POS          11 original form
    #  12 indexed expression
    # Example rows:
    #   서울,0,0,0,NNG,지명,T,서울,*,*,*,*,*
    #   불태워졌,0,0,0,VV+EM+VX+EP,*,T,불태워졌,inflected,VV,EP,*,불태우/VV/+어/EC/+지/VX/+었/EP/
    #   해수욕장,0,0,0,NNG,*,T,해수욕장,Compound,*,*,*,해수/NNG/+욕/NNG/+장/NNG/*
    udf_token['0'] = udf_token['word']
    udf_token['1'] = 0
    udf_token['2'] = 0
    udf_token['3'] = 1
    udf_token['4'] = 'NNG'
    udf_token['5'] = '*'
    udf_token['6'] = udf_token['last_yn']
    udf_token['7'] = udf_token['word']
    for column in ('8', '9', '10', '11', '12'):
        udf_token[column] = '*'

    # Keep only the numbered columns, i.e. drop the 'word'/'last_yn' inputs.
    udf_token_mecab = udf_token.loc[:, udf_token.columns.str.isnumeric()]
    udf_token_mecab.to_csv(
        os.path.join(
            dic_source_path,
            'user-dic',
            'udf.csv',
        ),
        mode=mode,
        header=False,
        index=False,
    )

    # Re-read the flag module so a build finished since import is noticed.
    importlib.reload(__BUILD_OK__)
    if not __BUILD_OK__.__BUILD_OK__:
        print("RE-BUILD 'MeCab' at the first time to add 'user-defined dictionary'.")
        build_mecab()
        importlib.reload(__BUILD_OK__)

    # Run inside the dictionary source directory via ``cwd`` instead of
    # interpolating the path into ``cd <path>``: an unquoted path containing
    # spaces or shell metacharacters made the ``cd`` fail and the following
    # commands run in the wrong directory.  The command string is constant.
    commands = ";".join(
        [
            "cp ../add-userdic.sh ./tools",
            "bash ./tools/add-userdic.sh",
        ]
    )
    out = subprocess.check_output(commands, shell=True, cwd=dic_source_path)
    print(out.decode('utf-8'))

    reset_mecabrc()