|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

I3 U- i2 l |
2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>

源码:
(1)
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

- u" c, [ H' F8 |5 _public class Indexer {
5 q' g& m( S7 \
. N4 q* P2 O* F9 m$ X private Integer ids[]={1,2,3};6 L: ^% n7 [. ]" U+ |7 w" e
private String citys[]={"青岛","南京","上海"};4 k# M m0 Y' S' f
private String descs[]={- B5 \1 m. @7 C5 A& A
"青岛是一个美丽的城市。",
" b% a* z1 m* b "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",
& l( e; o1 @# X5 z0 k$ ] "上海是一个繁华的城市。"
+ O. G! ]& {" @$ W! u/ F };0 V+ z2 N/ {/ E. ^
, S9 R; h- Y% R q4 Y private Directory dir;
4 H6 S. L4 S. T4 j; s4 u. G- s+ l4 R$ E0 o$ C6 |* r9 C& b- D; Z
/**
5 O! a9 B) L; A" ~0 D * 获取IndexWriter实例
+ H& h1 U: o# b2 U. w: z5 d * @return G- `2 u4 o. _" t2 M% o
* @throws Exception. {$ y8 J7 P" V1 C& Q
*/
* O3 d7 T/ S$ N. c& W5 c private IndexWriter getWriter()throws Exception{
. T8 e$ c# k, L* u4 w //Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
! |4 d5 B2 p9 }/ U) u' i0 O SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
9 D# Q0 L4 N' j7 v8 m3 j+ p7 Z IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
+ E; K# w: {( P; _% }2 `5 e IndexWriter writer=new IndexWriter(dir, iwc);
5 C. s [3 D7 b' v( J return writer;
, y+ ^$ Y- [( A. c, [3 p1 R/ l }3 S. r; t; |( z% J B3 g
2 d) t7 o( ]. I /**
, ^5 y5 |* g1 O% f* ~- m * 生成索引4 s2 a# q! d. g: F
* @param indexDir* R# p8 |/ s6 U7 N) M7 m& b
* @throws Exception) O7 d( d7 m, v5 {8 Y
*/
" i: A9 Z4 v6 H! G private void index(String indexDir)throws Exception{
1 y, e, d8 q/ K5 g1 f0 y* {% p dir=FSDirectory.open(Paths.get(indexDir));6 i' v: ? V3 w: P# _
IndexWriter writer=getWriter();
' c7 n0 m0 A5 Y/ z6 p6 [ for(int i=0;i<ids.length;i++){; D. L) E7 a$ q \; l9 x3 y
Document doc=new Document();' r8 ?# G) U- h5 f' m3 ]
doc.add(new IntField("id", ids, Field.Store.YES));
' ^8 B/ M) l0 c3 P, ]! x doc.add(new StringField("city",citys,Field.Store.YES));; _% ^+ g3 W9 ?( q* m. g/ v0 t+ {
doc.add(new TextField("desc", descs, Field.Store.YES));) e3 h6 ~1 y. _3 t
writer.addDocument(doc); // 添加文档
2 D6 {- l5 c- @ X1 T7 v4 d }. s' U- ?/ t( Z. d) k* J! C
writer.close();! M/ u' v$ r- s6 {5 j
}5 O) h( s7 n: H, a0 j8 A
- \1 U" `7 y. Y; M5 f" ]
! x: M' M% d& [& N: \ public static void main(String[] args) throws Exception {! o9 [8 D& i+ p% P3 Z h
new Indexer().index("D:\\lucene6");, ~1 V; ^! S; x E% x8 x
}
$ Y6 z/ n2 ]) t' i: w k5 ~$ v& s! d/ l5 x" i" J$ V3 Q
}

(2)
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

0 B" [' G6 y% O% w' l0 {public class Searcher {7 g7 ]2 V# C$ ?0 M, {% y# d
: T4 r$ ^4 Z8 J. i" o public static void search(String indexDir,String q)throws Exception{1 ]3 u! g3 S0 M3 U
Directory dir=FSDirectory.open(Paths.get(indexDir));" F$ ?! n. r2 a. a' g
IndexReader reader=DirectoryReader.open(dir);
. k0 A+ @. H8 O) s4 j2 H0 ^ IndexSearcher is=new IndexSearcher(reader);
% V2 x( b& G ]3 R& j0 N // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
! B* o. m# E1 u8 F, P3 L7 a/ D( v SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
$ i4 D0 ~+ ^6 w# P QueryParser parser=new QueryParser("desc", analyzer);
/ D( h" Y( ] t) i+ N, x0 X Query query=parser.parse(q);. N b4 |! f: J% ~: X
long start=System.currentTimeMillis();
8 r* x0 O! R: V! Q% z% }+ a. E% L TopDocs hits=is.search(query, 10);
6 Q. ~ Z3 z! L b; h8 g long end=System.currentTimeMillis();$ C/ Q, u# r- R6 e. F
System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");
' Y+ L4 |) p* Y8 a) e8 `+ R# B* t$ c* b2 E! t
QueryScorer scorer=new QueryScorer(query);2 Q( _8 a, k! ]# n- B, G) F1 m
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);8 q; T" |" T$ e
SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");- G0 b6 M+ w* P* H) j; d
Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);/ B5 I7 c, D6 x( _
highlighter.setTextFragmenter(fragmenter);2 l3 c i& d( B
for(ScoreDoc scoreDoc:hits.scoreDocs){& o8 \5 \+ @8 |; y. x2 l9 [6 ?. C* t1 A
Document doc=is.doc(scoreDoc.doc);
4 B; V2 \9 h* g" t System.out.println(doc.get("city"));
0 K! J- O' p5 p System.out.println(doc.get("desc"));
1 _' R$ N' E: E# E String desc=doc.get("desc");9 [& c3 t* }; V; o# A6 N6 M
if(desc!=null){& i" F. Q- y2 O% ` ^$ S
TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));6 l o* a) E' X B T4 ^
System.out.println(highlighter.getBestFragment(tokenStream, desc));
* M0 H1 H8 v: c8 u# A9 a) {9 n2 y V, x }9 ^' x0 y% U. W# l6 J
} `3 F& g, u8 `8 D
reader.close();8 x# T% _7 c/ x+ O5 a
}
0 f- @ g# @, m1 ]
$ Z$ V8 T& e- c* M; b public static void main(String[] args) {
3 `. D0 r7 I5 V A% S String indexDir="D:\\lucene6";
3 Z; F7 R$ b& t2 n9 @. O4 | String q="南京文明";# I' _" x; n6 {3 m- u
try {
5 J5 R0 ]/ e: R+ C) U search(indexDir,q);
$ W, W5 m( t t' _* P5 x( H5 h" I } catch (Exception e) {
- P2 R, w/ X$ {) C1 a6 K // TODO Auto-generated catch block
) N7 j8 V* N. k2 W7 E. Y2 X v e.printStackTrace();
6 U' X1 m6 i& z" P }8 A8 k* I. V7 v, [- O- t1 I; t/ T
}
/ ]/ {: q+ T7 B( @}0 z; q, \0 }0 ]4 u! r' M5 r6 ^
# S- Y. Z, q, M2 a; X! @8 z" u: L5 R7 k2 o! A; O
5 o" O& h* A+ q, }( S" {/ ^/ K% W
) W* ]. {+ P: \# @: `" X& h! O M- u( l$ b
|
|