|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>

源码:
(1)
import java.nio.file.Paths;! }1 K. P" v" U0 A o f
" ]: S" c$ w+ |4 Y/ U& @' d
import org.apache.lucene.analysis.Analyzer;7 F6 ?* y$ _/ R$ j% S" S
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
4 s& a' J( Y" L$ ?0 ^$ zimport org.apache.lucene.analysis.standard.StandardAnalyzer;
v6 e7 x# b6 u: x$ Rimport org.apache.lucene.document.Document;4 P7 b u; a# Y+ u4 f1 s; Y
import org.apache.lucene.document.Field;
' p9 }7 l; \/ U2 M4 Q9 f3 himport org.apache.lucene.document.IntField;
' o! k1 x0 p5 t8 i( f$ u# Y% ^import org.apache.lucene.document.StringField;
* K7 @$ A# {5 c( Ximport org.apache.lucene.document.TextField;; d/ K ]. `% b/ ~( z7 K" K
import org.apache.lucene.index.IndexWriter;
_8 C, t1 n8 G- L2 n4 ^import org.apache.lucene.index.IndexWriterConfig;* o- R" S2 y) q1 R0 N$ R, ?
import org.apache.lucene.store.Directory;6 Q% z; _# n4 ]0 H$ u7 u- {
import org.apache.lucene.store.FSDirectory;% R( R2 d s; U9 c u
5 T% p5 U4 |8 c- |& }5 N
public class Indexer {
4 W3 H7 d5 N) ]0 ?
]7 C+ k7 X- }) V5 a private Integer ids[]={1,2,3};$ D- ~: n: m/ ^& {
private String citys[]={"青岛","南京","上海"};1 a k& c" z/ ^" F" J* I `6 e
private String descs[]={
+ h# {# @. o' i5 u. v "青岛是一个美丽的城市。",
7 { l! c8 ^( x "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",) Z! q9 x% ]" q( \
"上海是一个繁华的城市。"" T/ A9 {& Y9 `% m6 h
};: M1 K$ `% a) [+ z, i) s
' n: ]5 |$ n; F2 i: E: s private Directory dir;' U, ~/ a7 Y$ X8 N8 E8 Y! D" R. [
. T$ G" s3 i C8 b
/**
m6 @) g8 a" _! G, h' j * 获取IndexWriter实例
5 q# \* y. J9 {+ A8 y: n/ I" X * @return1 s" P4 U* z' Z4 g, `0 I/ k/ t
* @throws Exception- [5 `! x; H/ D( H5 {; }6 P
*/! @2 C5 m8 L& y- z9 G4 ~" \$ D( ~
private IndexWriter getWriter()throws Exception{0 N1 P0 h( Z( V0 S- v* I
//Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
1 {( A+ R( @: l( [ SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
7 g) F% N7 _, J( u" ]7 X! { IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
! C2 k+ f/ U3 A6 u IndexWriter writer=new IndexWriter(dir, iwc);
. E" g& h* u$ R7 T return writer;
+ X3 V' p+ A2 ?% c* c }
+ E9 @7 M/ h: l2 d4 L+ T! @- u
9 z+ r& F) c( w* A: B9 [) |* } /**5 A# q/ X6 ]: `9 E
* 生成索引
1 y, A* H% h+ o" i: }* [3 e * @param indexDir! B Y4 u3 o( G$ O1 |6 h! Q$ c& l& B
* @throws Exception8 F0 `$ z. s9 E1 J
*/; H- F( J- c6 [1 p7 Z" t. e2 d
private void index(String indexDir)throws Exception{
+ b3 u* S5 L# g" B+ ~) L4 V/ L L dir=FSDirectory.open(Paths.get(indexDir));
7 s( I/ V3 ~1 M; {2 ~7 ^1 \# { IndexWriter writer=getWriter();
2 o; e4 M u1 e" [1 P% ?7 w for(int i=0;i<ids.length;i++){
, Q! g# x- D6 F% S' L6 o Document doc=new Document();2 e$ D" T! a3 ^
doc.add(new IntField("id", ids, Field.Store.YES));0 G- D1 w5 b; j" o/ X3 }' Z
doc.add(new StringField("city",citys,Field.Store.YES));
$ B" u, b! f5 J" d- Q doc.add(new TextField("desc", descs, Field.Store.YES));8 n) o% N# S8 Y2 O
writer.addDocument(doc); // 添加文档
: j% }1 \5 G4 g! ` p3 R6 }2 f) o }
0 L3 V1 m7 R# J) Y+ \8 f! r* l writer.close();
' T+ {7 V) K0 x: b0 Y$ [7 z }
: O2 n( ~) N! o8 Z+ o2 `6 A c( B1 |- N& B1 T+ m0 o. R
1 z* h" m6 ?! J, E+ f public static void main(String[] args) throws Exception {
2 D$ Y/ k; @+ ~ new Indexer().index("D:\\lucene6");8 Q) [1 i) ] |! X2 d
}# b# p2 i1 k; _8 h4 k
O9 Z+ f2 v5 O6 ~, i3 P}/ B( X( _- ? e/ D( }, i4 l- g" T
6 f9 a2 n% x* r/ o; E
4 w6 a* w2 Q- P3 T0 ?# Z
# K' y0 W; p- |( n5 k8 W% n7 g: Z
(2)
import java.io.StringReader;
0 A: j+ e7 I0 F% Himport java.nio.file.Paths;2 B; B; I; s A7 Z/ s$ j; g* |
% h0 u5 o4 h" B1 P F
import org.apache.lucene.analysis.Analyzer;- }, A3 P, f6 d
import org.apache.lucene.analysis.TokenStream;0 M$ O9 H" P; f4 b) t2 _6 m h( m
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;3 B4 @2 R; R L, g* x U. t
import org.apache.lucene.analysis.standard.StandardAnalyzer;2 Q: q8 l' b7 _7 ?( F
import org.apache.lucene.document.Document;9 ^- R1 m$ F0 [5 }7 F
import org.apache.lucene.index.DirectoryReader;
- f( e* ~- g+ r ]0 h. t7 Z. Eimport org.apache.lucene.index.IndexReader;8 R/ |/ i8 Z' S
import org.apache.lucene.queryparser.classic.QueryParser;" i+ I; m3 M. x# Q4 ]
import org.apache.lucene.search.IndexSearcher;# A& d( F9 Y4 u+ |( z% d5 m& {1 _
import org.apache.lucene.search.Query;1 R: O9 y" y& c( X. y7 ?* L) A
import org.apache.lucene.search.ScoreDoc;
. E. h; |' s; i; f3 d- Gimport org.apache.lucene.search.TopDocs;
( }$ T2 c( ~+ `$ }' {! P' Qimport org.apache.lucene.search.highlight.Fragmenter;
% o+ z+ A& h8 R, R! d- w, U& N, h/ nimport org.apache.lucene.search.highlight.Highlighter;
) _3 ?6 \. ^6 G. Mimport org.apache.lucene.search.highlight.QueryScorer;1 A6 _3 {) H8 u2 M: L
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;8 [) z# u! W0 b$ n6 S' [$ @7 a8 t
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;2 @3 e1 b3 K. k4 t
import org.apache.lucene.store.Directory;. T$ s3 _ k8 ]2 B
import org.apache.lucene.store.FSDirectory;: r6 T I! S. h+ {0 k1 R
. q* ]9 K2 e kpublic class Searcher {4 W% c; i1 R1 n/ o, m
6 L1 d" v4 y) }& @* R. c0 x, { public static void search(String indexDir,String q)throws Exception{; y9 F9 I9 }* q! }
Directory dir=FSDirectory.open(Paths.get(indexDir));
; y! g1 y8 P& J# I6 j. o+ N IndexReader reader=DirectoryReader.open(dir);! C2 L/ _# f' I
IndexSearcher is=new IndexSearcher(reader);
: \4 [# A& C1 o* M // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
: H; ?5 x+ }. X! [/ T! R! w4 N+ ^; c SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
1 g/ f! E. ]* Z9 }; I QueryParser parser=new QueryParser("desc", analyzer);6 I2 k0 l( V8 Z4 a; K& j( t
Query query=parser.parse(q);
& l7 T d- g4 g d- C long start=System.currentTimeMillis();, Y4 K$ B: V# m7 e2 B7 b) s
TopDocs hits=is.search(query, 10);
6 ?4 S8 y( p9 B0 O long end=System.currentTimeMillis();
5 U& N" V6 A( @* J2 M2 z E System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");
" K0 G8 Z. ~' \& S
9 w& ^7 B h# q0 I' i QueryScorer scorer=new QueryScorer(query);
+ [! D5 M( k6 q% y Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);
3 B2 J$ {9 {. M" q! ~! B. E8 S SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
! a [3 I3 w L Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);5 K/ l! ]" u! g
highlighter.setTextFragmenter(fragmenter);
8 u5 f* @0 [# c7 l; o for(ScoreDoc scoreDoc:hits.scoreDocs){1 m% f' A4 \9 C7 w9 U, R, j
Document doc=is.doc(scoreDoc.doc);: s. C4 F( t2 u* g' F
System.out.println(doc.get("city"));. f `9 j, ^' {. l; P
System.out.println(doc.get("desc"));
, ?( D6 j4 @7 v% G" C' q String desc=doc.get("desc");
$ C5 Y2 d& k0 ]# J$ N" S if(desc!=null){
. _- h% L+ z9 q# ^ TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));
& v4 O( C% V. @2 B* u- F. @. M System.out.println(highlighter.getBestFragment(tokenStream, desc));0 a3 o" U" p3 k( D# i5 r! w
}
8 J; R) B1 l! i1 i }
0 X( b& M- l9 N) @! M reader.close();. U+ q- x" x/ M; g) o
}
: D0 s m1 F6 |- S5 s. M! x1 b3 u% f8 a! p: G
public static void main(String[] args) {# D: c' y; _5 u5 D+ c/ F
String indexDir="D:\\lucene6";( I4 t7 u- \8 z; z. f
String q="南京文明";
$ f5 e- W1 o \5 e try {
' G% Z& d- f+ K* V, `( u search(indexDir,q);
8 }2 ^! E/ U" e3 F( W: J+ _* W } catch (Exception e) {8 ]0 ?" q4 b9 Q+ \4 O
// TODO Auto-generated catch block
. c, {: Q2 ~5 m e.printStackTrace();4 m5 X7 Z9 y" ]" B
}
& ]/ @4 W% m) a! u2 M) v }
4 t$ g& G# y1 x* s! u% |3 l& @}
! t( A2 U7 ], m m+ m* f: R* y( C( }/ I4 n# k& H
9 a! o& A) I8 u0 ]( J. s' y; k; \* C/ U* K1 _3 ?. f4 ]( [
$ A2 l' U4 x: {# G! h
+ b. c' A' G4 l \ |
|