|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>

源码:
(1)
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Indexer {
( z) [6 H+ J' m, J/ u# v4 F0 K7 d
private Integer ids[]={1,2,3};
' E% `+ X5 }$ y private String citys[]={"青岛","南京","上海"};
. V9 _! \! ?# b( Z private String descs[]={
) _+ k9 h& K+ E X; T "青岛是一个美丽的城市。",
5 g1 Y! Q% k+ X) M2 C; ` "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",+ ~6 S4 f4 U/ W2 A# k: K
"上海是一个繁华的城市。"
* L; _- a3 ?& ` I4 A! u! _7 Q };4 j0 m' N0 K& ]( Y: `
# Y6 _* E4 S! t. g5 M+ d3 U private Directory dir;' \# P" P3 @% r' t+ d9 O
7 G4 Q1 n0 Q7 @* W! Y /**# g7 r" P5 K. C% p2 k
* 获取IndexWriter实例8 q7 \6 ^- r3 Z1 d& S
* @return" c# Q; O" \- z7 D
* @throws Exception/ o$ N( B3 C! s! P
*/
, X5 ~+ R$ L6 P& m* K private IndexWriter getWriter()throws Exception{2 q+ \5 ^* c0 }$ k: p# l
//Analyzer analyzer=new StandardAnalyzer(); // 标准分词器 V3 X% H8 ~; u
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();: q* s& B8 C$ C" V
IndexWriterConfig iwc=new IndexWriterConfig(analyzer);) ^( J6 s a5 F l1 S; N0 z7 j$ }
IndexWriter writer=new IndexWriter(dir, iwc);
' U- x" P- C/ E/ } |. K. ]- j X0 i return writer;
* Q/ p- ^' q, j+ y3 r* P }7 `! V! T; k& _4 n3 f/ V \* j1 Z) |
. m8 r S5 v* }
/**
6 Q- k# Q$ |2 I5 r8 \ * 生成索引
8 M1 i% Q# Q: ~. y4 }* `8 \ * @param indexDir
* u( s# t8 p5 r! D * @throws Exception
! Y7 }! _! I0 B% i* o */' m a/ Y$ x; B: \6 d' J; g4 u5 _
private void index(String indexDir)throws Exception{
1 B* f6 I$ E2 M' @, w9 b; X dir=FSDirectory.open(Paths.get(indexDir));7 t( N0 z! Z/ k5 t0 D; c) b
IndexWriter writer=getWriter();
2 W- p. S' S$ f- \; I4 K( q: w0 B for(int i=0;i<ids.length;i++){
! X; A9 a0 @$ c Document doc=new Document();
2 l# L& H; Z" `! @! f0 w- d( j doc.add(new IntField("id", ids, Field.Store.YES));8 F( M/ g: H1 s. v6 |0 g+ ^- m; B1 a
doc.add(new StringField("city",citys,Field.Store.YES));6 Y* Y2 l9 D! s; ~7 b
doc.add(new TextField("desc", descs, Field.Store.YES));3 s# X$ [. _ r9 Q
writer.addDocument(doc); // 添加文档' x8 a- `1 C: ]* C
}: n$ A ^) j9 P% Z* _" c
writer.close();( n( f+ x. \1 y- J1 \
}8 M- F6 x- }5 ]& A5 x
4 l& m g7 E: } O# c# u& y5 X, y; q: C2 h$ f
public static void main(String[] args) throws Exception {
$ w" ~8 f/ J4 B9 [3 @+ ? new Indexer().index("D:\\lucene6");
8 u. s4 {# d2 s. | }
; @& u/ H- g# f# G! t. W, {. K- y! Q4 @- n/ c& Z
}


(2)
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searcher {
" j: T; |3 V$ P/ U% n$ S3 a
. r; _2 Z5 L; Q public static void search(String indexDir,String q)throws Exception{
; g# Q. k' H9 W) T Directory dir=FSDirectory.open(Paths.get(indexDir)); F# q# m) |- V4 z
IndexReader reader=DirectoryReader.open(dir);
# }9 L6 U) w: s# B2 h0 { IndexSearcher is=new IndexSearcher(reader);+ Y/ U9 K5 V1 r) M, D+ a
// Analyzer analyzer=new StandardAnalyzer(); // 标准分词器8 h. q! w( r: e3 B1 g
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
k0 D; h" W: Q QueryParser parser=new QueryParser("desc", analyzer);0 R2 n$ ^1 q6 l. _; D/ s
Query query=parser.parse(q);
- O) @' `7 J3 ~7 v( T long start=System.currentTimeMillis();
" ^( K% Q1 Q& Z" U9 h. o1 c2 x1 z. E8 p TopDocs hits=is.search(query, 10);/ H# ~% @1 b3 A. O+ r& u( m
long end=System.currentTimeMillis();6 I. S! o% n# m' F. R
System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");
- X7 s0 o, d. d" x: C, l
! A5 t; Y+ R$ F. w& Y9 Z, y% r/ Y QueryScorer scorer=new QueryScorer(query);; G3 b) f& o7 U6 o3 {5 ^( M3 v0 p
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);# Y6 }" v) \' I9 K% V& l: Q. f4 f
SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
7 g: d. [, h8 c% y Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);
6 }; X2 O( T& S+ ?# A: e highlighter.setTextFragmenter(fragmenter);
/ j9 F. @( s: m* J for(ScoreDoc scoreDoc:hits.scoreDocs){
9 k9 j& y. x" v Document doc=is.doc(scoreDoc.doc);# a6 l- O r- s8 _% p- B8 `' |4 @: X
System.out.println(doc.get("city"));( v1 u; ]. U k6 x* [+ n& T
System.out.println(doc.get("desc"));
6 a. _! h Q( }0 u$ i$ o6 ^) ]! v String desc=doc.get("desc");. I# A" T" F" i6 G% g& A
if(desc!=null){4 z" V# X0 \1 |: p. U. ^
TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));
7 C" B+ r; V0 f4 L& v- o, A6 J System.out.println(highlighter.getBestFragment(tokenStream, desc));
- c8 } S! {; X6 J }- T" C% }1 U: u w4 M1 s) W
}
E+ t1 O2 P4 u1 Z) z# | reader.close();% x8 D/ M/ X1 C5 z; L
}% k# r% o! q, I: a; L, ~
: e* m- J V9 z- x- v
public static void main(String[] args) {
8 E* {* g7 i) [+ n$ I0 r/ l% c X String indexDir="D:\\lucene6";
2 s2 a" p, K- A3 ^# r# V) n String q="南京文明";
3 d2 f+ q- t' J% S8 U+ N try {9 B( ]. v; @: R& g
search(indexDir,q);
4 J) d! Z6 ]2 d2 c } catch (Exception e) {
7 ^9 \2 e$ O4 J // TODO Auto-generated catch block* {1 {* G/ l, {. F# o* i2 h1 w
e.printStackTrace();
+ @4 K4 h5 y7 W6 l% j1 `7 X }
) M' a, v9 m6 Q" Z1 S/ O8 U }
, q. e8 h. `) b* M- [}
, i; g a0 x% T0 _. t/ Z3 s0 |: J- b7 H$ b
' a; {8 Z# P; U( w" h
* F# y" E6 r, n' i' D$ H
H; o) `7 j8 A/ f) k- B% W1 ^" E ~) F/ e2 {7 ], J( F/ S
|
|