Elasticsearch6

elasticsearch 6.* 기본기-4 한국어형태소분석기-nori

Jack Moon 2018. 10. 26. 13:30

품사태그표.xlsx

노리(nori)의 품사 태그표입니다. 노란색으로 표시된 품사 태그는 nori_part_of_speech 토큰 필터 적용 시 제외되는 품사입니다.




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
##############################
# nori analyzer
##############################
# https://www.elastic.co/guide/en/elasticsearch/plugins/6.4/analysis-nori.html
# nori analyzer 는 다음과 같은 tokenizer 와 token filters로 구성되어 있다
 
# nori_tokenizer
# nori_part_of_speech token filter
# nori_readingform token filter
# lowercase token filter
 
# 또한 decompound_mode 와 user_dictionary 를 지원하고 stoptags를 설정할 수 있다
 
##############################
# 1. nori_tokenizer
# decompound_mode : 토큰을 처리하는 방법을 결정
#    none: 분해하지 않는다
#    discard: 분해하고 원본 삭제 (default)
#    mixed: 분해하고 원본도 유지
# user_dictionary
#    위치: $ES_HOME/config/userdic_sample.txt
#       c++                 
#       C샤프
#       세종
#       세종시 세종 시 
# Drop any previous copy of the index so the PUT below defines it from scratch.
DELETE nori_sample

# Create nori_sample with three nori tokenizers that share the same user
# dictionary (userdic_sample.txt) and differ only in decompound_mode:
#   nori_user_mixed        -> "mixed"
#   nori_user_dict_discard -> decompound_mode omitted, so the default applies
#   nori_user_dict_none    -> "none"
# Each tokenizer is wrapped in a custom analyzer (analyzer_mixed,
# analyzer_discard, analyzer_none) with no token filters.
PUT nori_sample
{
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "nori_user_mixed": {
            "type": "nori_tokenizer",
            "decompound_mode": "mixed",
            "user_dictionary": "userdic_sample.txt"
          },
          "nori_user_dict_discard": {
            "type": "nori_tokenizer",
            "user_dictionary": "userdic_sample.txt"
          },
          "nori_user_dict_none": {
            "type": "nori_tokenizer",
            "decompound_mode": "none",
            "user_dictionary": "userdic_sample.txt"
          }
        },
        "analyzer": {
          "analyzer_mixed": {
            "type": "custom",
            "tokenizer": "nori_user_mixed"
          },
          "analyzer_discard": {
            "type": "custom",
            "tokenizer": "nori_user_dict_discard"
          },
          "analyzer_none": {
            "type": "custom",
            "tokenizer": "nori_user_dict_none"
          }
        }
      }
    }
  }
}
 
# Run the sample news sentence through analyzer_mixed to inspect the tokens
# produced when decompound_mode is "mixed".
GET nori_sample/_analyze
{
  "analyzer": "analyzer_mixed",
  "text": "(서울=연합뉴스) 김동규 기자 = 슈퍼 태풍 '위투'가 서태평양을 강타하면서 사이판공항이 폐쇄돼 26일 오전 10시(현지시간) 현재 사이판을 오가는 하늘길이 모두 막혔다."
}
 
# Run the same news sentence through analyzer_discard so its tokens can be
# compared with the analyzer_mixed output.
GET nori_sample/_analyze
{
  "analyzer": "analyzer_discard",
  "text": "(서울=연합뉴스) 김동규 기자 = 슈퍼 태풍 '위투'가 서태평양을 강타하면서 사이판공항이 폐쇄돼 26일 오전 10시(현지시간) 현재 사이판을 오가는 하늘길이 모두 막혔다."
}
 
# Analyze a short sentence with analyzer_none (decompound_mode "none").
GET nori_sample/_analyze
{
  "analyzer": "analyzer_none",
  "text": "소녀시대는 노래를 잘한다"
}
 
# To inspect token attributes in more detail, enable "explain" and list the
# attributes to return for each token.
GET nori_sample/_analyze
{
  "analyzer": "analyzer_none",
  "text": "소녀시대는 노래를 잘한다",
  "attributes": ["posType", "leftPOS", "rightPOS", "morphemes", "reading"],
  "explain": true
}
 
##############################
# 2. nori_part_of_speech token filter
# 특정 품사 태그를 제거한다.
# 기본값은
#"stoptags": [
#    "E",
#    "IC",
#    "J",
#    "MAG", "MAJ", "MM",
#    "SP", "SSC", "SSO", "SC", "SE",
#    "XPN", "XSA", "XSN", "XSV",
#    "UNA", "NA", "VSV"
#]
 
# Drop the index created earlier so it can be redefined with a POS filter.
DELETE nori_sample

# Recreate nori_sample with a single tokenizer (decompound_mode "none", user
# dictionary userdic_sample.txt) and a nori_part_of_speech filter named
# my_filter whose stoptags list contains only "NR". analyzer_none chains the
# tokenizer with that filter.
PUT nori_sample
{
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "nori_user_dict_none": {
            "type": "nori_tokenizer",
            "decompound_mode": "none",
            "user_dictionary": "userdic_sample.txt"
          }
        },
        "filter": {
          "my_filter": {
            "type": "nori_part_of_speech",
            "stoptags": [
              "NR"
            ]
          }
        },
        "analyzer": {
          "analyzer_none": {
            "type": "custom",
            "tokenizer": "nori_user_dict_none",
            "filter": ["my_filter"]
          }
        }
      }
    }
  }
}
 
# Analyze a sentence with analyzer_none, which now applies my_filter
# (stoptags: "NR"), to see which tokens the filter removes.
GET nori_sample/_analyze
{
  "analyzer": "analyzer_none",
  "text": "여섯 용이 소녀시대는 노래를 잘한다"
}
 
##############################
# 3. nori_readingform token filter
# 한자를 한글로 변환하는 필터
 
cs


품사태그표.xlsx
0.01MB