|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>

源码:
(1)
& k( R& \; U0 {/ `import java.nio.file.Paths;/ }* f2 _# r G/ @! _9 y
" }6 S' e2 x$ X& ?: Iimport org.apache.lucene.analysis.Analyzer;
' [! W; i B) ^: `3 Wimport org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
/ l7 s9 y3 T1 }$ Himport org.apache.lucene.analysis.standard.StandardAnalyzer;
$ b3 h2 ~7 [( kimport org.apache.lucene.document.Document;% L) J8 C$ k, _1 B6 }
import org.apache.lucene.document.Field;
$ e! I- O6 `% q4 R& `( T1 E+ G7 b* }import org.apache.lucene.document.IntField;& f8 R2 L/ x1 q& Q) h2 I& e9 m
import org.apache.lucene.document.StringField;
2 m+ C9 O* [8 k6 R8 e0 m p5 f6 Oimport org.apache.lucene.document.TextField;
& g8 c' r4 _' z' O) P) ?import org.apache.lucene.index.IndexWriter;* Y# M: j6 L2 w
import org.apache.lucene.index.IndexWriterConfig;+ M8 b0 T4 i( \! O( E- K( J" | E
import org.apache.lucene.store.Directory;. `1 s* U% Q# i
import org.apache.lucene.store.FSDirectory;; q# h* a7 v$ d( s6 i1 L E) }% m
! h/ ~4 {+ ~! M* [6 Apublic class Indexer { G$ s; G) c1 Y( ]7 h
6 F/ C3 \" y+ ?3 g
private Integer ids[]={1,2,3};
% U' B# x/ ?, E" O( w7 z! _/ E private String citys[]={"青岛","南京","上海"};4 p7 M* N8 T1 }4 r2 t
private String descs[]={- Q# r' r. |5 s* R
"青岛是一个美丽的城市。",
1 A7 E( N' m2 ~! H" g "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",
# m/ R, U2 }! |& {8 I "上海是一个繁华的城市。"3 q% G K4 A# f! j9 Z9 F
};4 x6 k' I4 q; s3 e
3 _* n( J7 K9 y' J! o private Directory dir;! Q$ j/ r- N4 [5 A9 x9 T
' z/ x: b' {& G7 f5 }! P /**8 w l, t1 `+ v6 c4 x6 D
* 获取IndexWriter实例
3 L7 ^# f; u' u+ e * @return
9 A; e8 e" L3 y& E) b/ \% R * @throws Exception
8 t) X# U: `; j( x4 h5 Y */
" p/ O# E( E0 k! B; n4 G/ Z- C# s8 k private IndexWriter getWriter()throws Exception{
6 O2 N1 n8 O u" b3 H //Analyzer analyzer=new StandardAnalyzer(); // 标准分词器2 S8 g8 V# I) ]/ a# Q: J% R' i' n3 Z
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
& I! q! V% h5 t( I IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
* v! [$ n n+ [( a, j" ] IndexWriter writer=new IndexWriter(dir, iwc);' Z9 N ?8 C- a4 j4 J
return writer;+ S+ x' y$ A! O, [( C$ e
}
! ?# P( [; a, K, D$ z2 P7 z, h! H/ }( T, m" [: @# I
/**6 Q- G' I9 u) v& a+ l, \8 z
* 生成索引
. u0 Y! E# I. N/ V% j% s * @param indexDir
5 _9 b( w) A& a. Q0 G * @throws Exception8 N4 _7 \6 @6 W
*/- L, C, X$ X9 ^8 h+ @* l
private void index(String indexDir)throws Exception{+ S: c# k* l8 A' o! g
dir=FSDirectory.open(Paths.get(indexDir));. u: d0 F5 V* o, r6 \0 F0 ~
IndexWriter writer=getWriter();' q4 x! X& K3 O1 {2 x0 D
for(int i=0;i<ids.length;i++){
8 @( \: O& N b$ E3 y7 ?; } Document doc=new Document();: p) `3 A- a, z
doc.add(new IntField("id", ids, Field.Store.YES));
, ^- {6 x& Y0 B; L: P doc.add(new StringField("city",citys,Field.Store.YES));' _$ B# i; m- b( F; y2 C7 C
doc.add(new TextField("desc", descs, Field.Store.YES));4 x$ H) T7 d$ \1 W% I
writer.addDocument(doc); // 添加文档, w5 s6 j4 w# }
}- \) D* m5 E1 T( W4 [6 Z/ g1 D
writer.close();! R& c' G, c3 q
}
Q1 ]: Y6 L# i( _/ w/ T* |0 h; D: w* c* K" o) q3 N+ w
6 p: J9 n0 C% E9 G0 G* H
public static void main(String[] args) throws Exception {3 ~9 P' Y, O: N d1 i$ n
new Indexer().index("D:\\lucene6");
% K. }/ b( i# _ d }
$ a9 q5 v7 O4 u0 k, S- I# @
1 _& l0 V) j: C. G3 R}9 c% y: P# M$ d- ?# }& b$ X) z


(2)
+ v* O" f: \ E- L1 ]3 f# Pimport java.io.StringReader;
1 F* r- ?4 [# V2 H/ N8 Gimport java.nio.file.Paths;
; v7 F; ~4 ^' f: z. r* m$ l2 w+ n3 z6 t# L0 U
import org.apache.lucene.analysis.Analyzer;5 q/ U% @; [9 I" Z" D) ]
import org.apache.lucene.analysis.TokenStream;
+ Z8 P2 Q" ?8 w8 q# r' r. Kimport org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
$ g# u2 w) d0 U8 uimport org.apache.lucene.analysis.standard.StandardAnalyzer;
$ V; N. V% Q( Q3 u6 P& L+ simport org.apache.lucene.document.Document;
|5 `4 M% j: b" M+ Mimport org.apache.lucene.index.DirectoryReader;% e& j. a4 C; v
import org.apache.lucene.index.IndexReader;2 Z/ ~) {% I* \& `
import org.apache.lucene.queryparser.classic.QueryParser;
+ s) @, [' e" J5 F h1 |* jimport org.apache.lucene.search.IndexSearcher;
. F. M/ Q- H* d* X9 ximport org.apache.lucene.search.Query;
# [$ H+ s6 v/ g4 }; E, x1 e" {import org.apache.lucene.search.ScoreDoc;0 M; d J4 l3 p* c8 x
import org.apache.lucene.search.TopDocs;
' [4 [+ T9 Z" o7 ]5 Jimport org.apache.lucene.search.highlight.Fragmenter;
8 f& c7 h' B: L, P9 F! himport org.apache.lucene.search.highlight.Highlighter;
1 y* h& t1 v7 v$ g2 qimport org.apache.lucene.search.highlight.QueryScorer;
( P8 z) @; v' m# Y! H+ ]import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
) p; @1 s4 B# B# e2 D Bimport org.apache.lucene.search.highlight.SimpleSpanFragmenter;: F: H( Q7 a6 }" o W5 i J6 Q
import org.apache.lucene.store.Directory;
' N: v$ ~7 a c+ j- Z( p: Q Iimport org.apache.lucene.store.FSDirectory;
$ _0 y1 f8 r3 ]- X2 Z# C/ f5 R, v; ]# O: J9 n
public class Searcher {- e; D2 U0 x& H
* j+ X4 z' q$ r, n5 Q2 t
public static void search(String indexDir,String q)throws Exception{
! _7 M P+ }' q- {/ U: F' Q s0 T Directory dir=FSDirectory.open(Paths.get(indexDir));
. Q: L9 A' e* [" ~ IndexReader reader=DirectoryReader.open(dir);
: G- A6 o8 ], } IndexSearcher is=new IndexSearcher(reader);
X$ Y5 Z& W% z! o: W // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器3 @- N& z% U, u! L
SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
$ w( V7 ^ g$ l% m QueryParser parser=new QueryParser("desc", analyzer);
8 K6 u+ Y: ~8 `+ X% b2 t: @6 w Query query=parser.parse(q);7 o2 l4 @# z- `0 w$ U
long start=System.currentTimeMillis();4 a0 F9 g0 G2 r5 F
TopDocs hits=is.search(query, 10);/ m1 I |- v8 r( H- X1 R# N
long end=System.currentTimeMillis();" ]) c$ t+ s. c" M/ ^# d
System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");
8 u3 M$ p Z n' q! Y. \/ \5 t
8 |! p/ w! Z7 C! h+ m8 E QueryScorer scorer=new QueryScorer(query);% I" E6 r; D" G8 _9 x9 [# P
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);: j3 Q _. x ]% V% _+ A
SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
0 X( a7 e7 `7 K Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);
. B# x3 B$ ]0 u& N highlighter.setTextFragmenter(fragmenter);
7 P2 u9 Y0 o- | for(ScoreDoc scoreDoc:hits.scoreDocs){
8 ?. _8 y! G6 ]; }3 L9 l: \ Document doc=is.doc(scoreDoc.doc);
: Y2 D+ g/ j' t9 r6 [' p. l2 a System.out.println(doc.get("city"));8 B8 B2 b& k5 T8 P* m7 h+ D
System.out.println(doc.get("desc"));: H6 G/ S Q& Q
String desc=doc.get("desc");
. C) V; _( t& n( v ]8 ^+ w8 W7 M if(desc!=null){; ]9 P. l# H5 \4 w, n7 `! B5 m
TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));3 h7 ^, a5 L2 ?
System.out.println(highlighter.getBestFragment(tokenStream, desc));. u5 c2 z, c- L ~- {; ?
}& H9 T- o6 L; Q2 Z+ M6 @
}1 s# H1 o! B/ l! g( C3 K5 ~8 t# Z
reader.close();6 I. T4 ]' F1 G: ~8 n8 J: m
}
8 _" z) X, w n8 S3 H; Q! J. Q$ R0 B, n; J. I+ l
public static void main(String[] args) {
1 Q- y7 ?- f# n( J* a) h String indexDir="D:\\lucene6";
. t' m- m: e! [" q' x5 P8 J String q="南京文明";/ ?1 \6 R+ R1 K9 r& U4 |
try {
! g7 W! ]; B% {( M& {$ l/ u9 I3 H# Q7 S search(indexDir,q);
9 F+ v3 W$ Z2 [! f. I z } catch (Exception e) {0 P: L! I( q3 |# f1 y
// TODO Auto-generated catch block
. U. I) H0 A [! D- q8 T5 G- k e.printStackTrace();
# Y3 b# @% s9 p9 O, | }
9 A8 q( v9 a* A9 P* ~/ U: y; o }3 u6 E7 ]; C3 p
}4 y& _; |+ o7 h3 Z9 d9 f* u
: @. {% S: d) N4 K: ]! E& V$ e4 S& Y/ ~* ?. A ~1 [+ Q' `4 d# u$ g
/ f5 G$ c& V& s0 A! _0 p5 l1 c; H, Q# @
/ d& C4 g1 ]* w1 V |
|