关于使用Filter减少Lucene tf idf打分计算的调研

将query改成filter, lucene中有个QueryWrapperFilter性能比较差,所以基本上都需要自己写filter, 包括TermFilter, ExactPhraseFilter, ConjunctionFilter, DisjunctionFilter。

这几天验证下来,还是or改善最明显,4个termfilter,4508个返回结果,在我本机上性能提高1/3。ExactPhraseFilter也有小幅提升(5%-10%)。

最令人不解的是and,原来以为跟结果数和子查询数相关,但几次测试基本都是下降。

附ExactPhraseFilter和ut代码:

import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;

// A fake to lucene phrase query, but far simplified.
// A simplified substitute for Lucene's PhraseQuery: matches documents that
// contain all added terms as one exact, consecutive phrase, but performs no
// tf/idf scoring work. Terms must all belong to one field and are added in
// phrase order.
public class ExactPhraseFilter extends Filter {
	protected final ArrayList<Term> terms = new ArrayList<Term>();
	protected final ArrayList<Integer> positions = new ArrayList<Integer>();
	
	// Field shared by every term; fixed by the first add() call.
	protected String fieldName;
	
	/**
	 * Appends the next term of the phrase. Its position within the phrase is
	 * its insertion index. Every term must use the same field as the first.
	 */
	public void add(Term term) {
		if (terms.size() == 0) {
			fieldName = term.field();
		} else {
			// Compare field names by value; '==' would only work when the
			// strings happen to be interned.
			assert fieldName.equals(term.field());
		}
		positions.add(Integer.valueOf(terms.size()));
		terms.add(term);
	}
	
	@Override
	public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException
	{
		return new ExactPhraseDocIdSet(context, acceptDocs);
	}
	
	// Per-term posting state. Instances sort by ascending docFreq so the
	// rarest term drives the document loop.
	static class PostingAndFreq implements Comparable<PostingAndFreq> {
		DocsAndPositionsEnum posEnum;
		int docFreq;
		int position;       // offset of this term inside the phrase
		boolean useAdvance; // true: skip with advance(); false: mostly nextDoc()
		int posFreq = 0;    // number of positions in the current doc
		int pos = -1;       // current position, normalized by phrase offset
		int posTime = 0;    // positions consumed so far in the current doc
		
		public PostingAndFreq(DocsAndPositionsEnum posEnum, int docFreq, int position, boolean useAdvance) {
			this.posEnum = posEnum;
			this.docFreq = docFreq;
			this.position = position;
			this.useAdvance = useAdvance;
		}
	 
		@Override
		public int compareTo(PostingAndFreq other) {
			// Rarest first; ties broken by phrase position. docFreq values
			// are non-negative, so the subtraction cannot overflow.
			if (docFreq != other.docFreq) {
				return docFreq - other.docFreq;
			}
			if (position != other.position) {
				return position - other.position;
			}
			return 0;
		}
	}
	
	// DocIdSet that resolves the phrase terms once per segment and exposes an
	// iterator over documents containing the exact phrase.
	protected class ExactPhraseDocIdSet extends DocIdSet {
		protected final AtomicReaderContext context;
		protected final Bits acceptDocs;
		protected final PostingAndFreq[] postings;
		protected boolean noDocs = false; // true when a term is absent from this segment
		
		public ExactPhraseDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
			this.context = context;
			this.acceptDocs = acceptDocs;
			
			Terms fieldTerms = context.reader().fields().terms(fieldName);
			postings = new PostingAndFreq[terms.size()];
			
			TermsEnum te = fieldTerms.iterator(null);
			for (int i = 0; i < terms.size(); ++i) {
				final Term t = terms.get(i);
				if (!te.seekExact(t.bytes(), true)) {
					// A missing term means no document can match the phrase.
					noDocs = true;
					return;
				}
				if (i == 0) {
					postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), false);
				} else {
					// Terms much more frequent than the first one are cheaper
					// to skip with advance() than to step with nextDoc().
					postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), te.docFreq() > 5 * postings[0].docFreq);
				}
			}
			
			// Rarest term first; it leads the iteration in nextDoc(). The
			// non-lead enums are primed onto their first document here.
			ArrayUtil.mergeSort(postings);
			for (int i = 1; i < terms.size(); ++i) {
				postings[i].posEnum.nextDoc();
			}
		}
		
		@Override
		public DocIdSetIterator iterator() throws IOException
		{
			if (noDocs) {
				return EMPTY_DOCIDSET.iterator();
			} else {
				return new ExactPhraseDocIdSetIterator(context, acceptDocs);
			}
		}
		
		protected class ExactPhraseDocIdSetIterator extends DocIdSetIterator {
			protected int docID = -1;
			
			public ExactPhraseDocIdSetIterator(AtomicReaderContext context, Bits acceptDocs) throws IOException {
			}
			
			@Override
			public int nextDoc() throws IOException {
				while (true) {
					// Step the rarest term; every candidate doc must contain it.
					final int doc = postings[0].posEnum.nextDoc();
					if (doc == DocIdSetIterator.NO_MORE_DOCS) {
						return docID = doc;
					}
					
					// Try to line the remaining terms up on the same doc.
					int i = 1;
					while (i < postings.length) {
						final PostingAndFreq pf = postings[i];
						int doc2 = pf.posEnum.docID();
						if (pf.useAdvance) {
							if (doc2 < doc) {
								doc2 = pf.posEnum.advance(doc);
							}
						} else {
							// Cheap sequential stepping, with a one-shot
							// advance() escape hatch after 50 steps (advance
							// always lands at or beyond doc, ending the loop).
							int iter = 0;
							while (doc2 < doc) {
								if (++iter == 50) {
									doc2 = pf.posEnum.advance(doc);
								} else {
									doc2 = pf.posEnum.nextDoc();
								}
							}
						}
						if (doc2 > doc) {
							// This term skipped past doc: candidate fails.
							break;
						}
						++i;
					}
					
					if (i == postings.length) {
						// All terms occur in this doc; verify adjacency.
						docID = doc;
						if (containsPhrase()) {
							return docID;
						}
					}
				}
			}
			
			@Override
			public int advance(int target) throws IOException {
				// Linear skip built on nextDoc(). NO_MORE_DOCS is
				// Integer.MAX_VALUE, so the loop always terminates; this
				// replaces the previous bare 'throw new IOException()',
				// which broke any consumer that called advance().
				int doc;
				do {
					doc = nextDoc();
				} while (doc < target);
				return doc;
			}
			
			// Returns true when the current document contains the terms at
			// consecutive positions. Each term's positions are normalized by
			// subtracting its phrase offset, so a phrase match is "all terms
			// report the same normalized position".
			private boolean containsPhrase() throws IOException {
				int index = -1;
				int i = 0;
				PostingAndFreq pf;
				
				// Prime every enum with its first normalized position.
				for (i = 0; i < postings.length; ++i) {
					postings[i].posFreq = postings[i].posEnum.freq();
					postings[i].pos = postings[i].posEnum.nextPosition() - postings[i].position;
					postings[i].posTime = 1;
				}
				
				while (true) {
					pf = postings[0];
					
					// Advance the lead term up to the candidate index.
					while (pf.pos < index && pf.posTime < pf.posFreq) {
						pf.pos = pf.posEnum.nextPosition() - pf.position;
						++pf.posTime;
					}
					if (pf.pos >= index) {
						index = pf.pos;
					} else if (pf.posTime == pf.posFreq) {
						// Lead term exhausted below the candidate index.
						return false;
					}
					
					// Advance the remaining terms toward the same index.
					for (i = 1; i < postings.length; ++i) {
						pf = postings[i];
						while (pf.pos < index && pf.posTime < pf.posFreq) {
							pf.pos = pf.posEnum.nextPosition() - pf.position;
							++pf.posTime;
						}
						if (pf.pos > index) {
							// Overshot: restart the scan from this position.
							index = pf.pos;
							break;
						}
						if (pf.pos == index) {
							continue;
						}
						if (pf.posTime == pf.posFreq) {
							// Exhausted without reaching index: no match.
							return false;
						}
					}
					if (i == postings.length) {
						// Every term aligned on the same normalized position.
						return true;
					}
				}
			}

			@Override
			public int docID()
			{
				return docID;
			}
		}

	}
	
}

UT:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.testng.annotations.AfterTest;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;

import com.dp.arts.lucenex.codec.Dp10Codec;

// Tests for ExactPhraseFilter: indexes a handful of CJK keyword documents
// and checks that only documents containing the exact consecutive phrase
// "烧" + "烤" match.
public class ExactPhraseFilterTest
{
	final Directory dir = new RAMDirectory();
	
	@BeforeTest
	public void setUp() throws IOException {
		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
		iwc.setOpenMode(OpenMode.CREATE);
		iwc.setCodec(Codec.forName(Dp10Codec.DP10_CODEC_NAME));
		
		IndexWriter writer = new IndexWriter(dir, iwc);
		try {
			addDocument(writer, analyzer, "新疆烧烤");  // 0
			addDocument(writer, analyzer, "啤酒");  // 1
			addDocument(writer, analyzer, "烤烧");  // 2
			addDocument(writer, analyzer, "烧烧烧");  // 3
			addDocument(writer, analyzer, "烤烧中华烧烤"); // 4
		} finally {
			// Always release the index write lock, even if indexing fails.
			writer.close();
		}
	}
	
	// Adds one document with the given keyword string. Reuses the supplied
	// analyzer instead of allocating a new StandardAnalyzer per document.
	private void addDocument(IndexWriter writer, Analyzer analyzer, String str) throws IOException {
		Document doc = new Document();
		doc.add(new TextField("searchkeywords", str, Store.YES));
		writer.addDocument(doc, analyzer);
	}
	
	@AfterTest
	public void tearDown() throws IOException
	{
		this.dir.close();
	}
	
	@Test
	public void test1() throws IOException
	{
		IndexReader reader = DirectoryReader.open(dir);
		try {
			IndexSearcher searcher = new IndexSearcher(reader);
			
			ExactPhraseFilter pf = new ExactPhraseFilter();
			pf.add(new Term("searchkeywords", "烧"));
			pf.add(new Term("searchkeywords", "烤"));
			Query query = new ConstantScoreQuery(pf);
			TopDocs results = searcher.search(query, 20);
			
			// Only docs 0 and 4 contain "烧" immediately followed by "烤";
			// docs 2/3 contain the characters but not in phrase order.
			assert results.totalHits == 2;
			assert results.scoreDocs[0].doc == 0;
			assert results.scoreDocs[1].doc == 4;
		} finally {
			// Close the reader even when an assertion fails.
			reader.close();
		}
	}
}
我来评几句
登录后评论

已发表评论数()

相关站点

+订阅
热门文章