1. 程式人生 > >Lucene 7.2.1 自定義Analyzer和TokenFilter

Lucene 7.2.1 自定義Analyzer和TokenFilter

oge close protect .get one AC put stand hash

1.自定義Analyzer:

import com.dys.lucene.filter.SameWordTokenFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer that tokenizes text with a {@link StandardTokenizer} and passes the
 * resulting tokens through a {@link SameWordTokenFilter}.
 */
public class SameWordAnalyzer extends Analyzer {

    /**
     * Builds the analysis chain: a standard tokenizer as the source, wrapped by
     * the same-word filter as the final stream.
     *
     * @param fieldName name of the field being analyzed (not used here; every
     *                  field gets the same chain)
     * @return the components holding the tokenizer and the filtered stream
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final StandardTokenizer source = new StandardTokenizer();
        return new TokenStreamComponents(source, new SameWordTokenFilter(source));
    }
}

2.自定義TokenFilter

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import
java.util.Map; import java.util.Stack; public class SameWordTokenFilter extends TokenFilter { private CharTermAttribute charTermAttribute; private PositionIncrementAttribute positionIncrementAttribute; private State state; private Stack<String> stack; public SameWordTokenFilter(TokenStream input) { super(input); this.stack = new Stack<>(); this.charTermAttribute = this.addAttribute(CharTermAttribute.class); this.positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class); this.stack = new Stack<>(); } @Override public final boolean incrementToken() throws IOException { while (this.stack.size() > 0) { this.restoreState(this.state); this.charTermAttribute.setEmpty(); this.charTermAttribute.append(this.stack.pop()); this.positionIncrementAttribute.setPositionIncrement(0); return true; } if (!this.input.incrementToken()) { return false; } String term = this.charTermAttribute.toString(); if (this.getSameWords(term)) { this.state = this.captureState(); } return true; } private boolean getSameWords(String name) { Map<String, String[]> map = new HashMap<>(); map.put("美", new String[]{"美麗", "好看"}); map.put("花", new String[]{"鮮花", "花朵"}); String[] words = map.get(name); if (words != null) { for (String word : words) { this.stack.push(word); } return true; } return false; } }

3.使用自定義Analyzer和自定義TokenFilter

ArrayList<String> strings = new ArrayList<String>() {{
            this.add("小鬼子");
            this.add("美國佬");
        }};
        Analyzer analyzer = new CustomStandardAnalyzer(strings);
        String content = "小鬼子 and 美國佬 are playing together!";
        TokenStream tokenStream = analyzer.tokenStream("myfield", content);
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            // 已經過濾掉自定義停用詞
            // 輸出:playing   together
            System.out.println(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        
        analyzer.close();

4.代碼解釋:Analyzer 和 TokenFilter 之間的具體關聯,可以用 Eclipse 的 DEBUG 功能逐步跟蹤來理解。

Lucene 7.2.1 自定義Analyzer和TokenFilter