|
1、中文分词(smartcn)
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>5.3.1</version>
</dependency>

2、高亮显示
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>5.3.1</version>
</dependency>
源码:
(1)
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

% [ m3 u0 g, epublic class Indexer {
# Z/ c# _0 I1 R0 R, d
- R: H; ?9 Q- w7 B9 h2 T private Integer ids[]={1,2,3};, R+ d6 w+ L9 `1 ^8 P6 Y
private String citys[]={"青岛","南京","上海"};
* E" r5 l6 r2 O private String descs[]={, X7 [) i5 }( ]8 x
"青岛是一个美丽的城市。",
8 m. Q* W, M; X% {& W6 t "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。",; k8 @; \9 q6 Z$ ^ ]
"上海是一个繁华的城市。"
* D V( w# ]2 O! J+ |) ~ };
% [& G& s, h, G# P( v$ l
; n/ I4 z5 U. _ h* W private Directory dir;
7 ~- E( ]+ P! Q* O/ ^ k5 @$ N# T! c' O: l+ ^ N: A7 F
/**8 X' f6 d& w1 ?# l7 h/ {; r$ e
* 获取IndexWriter实例
5 e0 u" @0 L* N9 {( B9 y5 D5 \% _ * @return
9 L/ l3 y) U+ w6 e * @throws Exception* \ o8 ~* K) w
*/
$ w4 P0 n+ h) ^ private IndexWriter getWriter()throws Exception{0 h/ J# L s: I# E% u
//Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
* ]% s3 z# N+ P; O5 e' r SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
) |3 ~, V9 g2 @8 w: K( `" T) W IndexWriterConfig iwc=new IndexWriterConfig(analyzer);0 {9 V$ C/ o6 d6 V! [6 z& G
IndexWriter writer=new IndexWriter(dir, iwc);
" \4 \' @! Y O6 j8 H/ H" [7 c; d return writer;3 P& b0 K* f0 [( e
}
9 |0 b; w6 ?1 i, T! G
! ~. c* }: E* ?& C$ O c /**+ o$ Y: R C3 P
* 生成索引) i% f) X: {, i: O0 j
* @param indexDir" ]( _9 {4 g! d& F. n) C
* @throws Exception2 d9 [# W, b1 K( a
*/
3 w' w" E% g0 S& m private void index(String indexDir)throws Exception{' L0 B& l- M# C" l7 `$ t
dir=FSDirectory.open(Paths.get(indexDir));* s* D( A8 K4 ]( B* J- o# a( ^
IndexWriter writer=getWriter();
$ F: \$ Z" K0 i for(int i=0;i<ids.length;i++){
+ T3 i* T; T, H' Q |) o& h( k Document doc=new Document();7 ~8 g2 c! i3 w, H) J b/ h1 y* E. [
doc.add(new IntField("id", ids, Field.Store.YES));/ N& [8 j3 l/ _( q B0 ~
doc.add(new StringField("city",citys,Field.Store.YES));
# ] R% Y# c; m& C/ | doc.add(new TextField("desc", descs, Field.Store.YES));/ Q4 @( |: e* |) w( i9 l
writer.addDocument(doc); // 添加文档
3 a: b' t5 A2 H }8 z8 e1 G3 \- S/ d# Z6 M0 N4 X
writer.close();
" Q2 I. I# O) \0 X6 g' L$ a4 | }
( O. b+ ?! E% l: n9 C& h
7 c7 P% `% w X& r3 {) f1 R/ h6 D3 n5 g( Q L
public static void main(String[] args) throws Exception {
" O9 b' Z) [% n% v6 E" W" C new Indexer().index("D:\\lucene6");0 A; J, w$ u& V* u$ m
}7 D) @( R0 r- J! s& D
! z* E" `& q& v) a+ j
}

(2)
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searcher {; y* @8 R# _( {8 V6 \& d/ t
7 H. F" t% e- n public static void search(String indexDir,String q)throws Exception{, d. O; g, a% ]
Directory dir=FSDirectory.open(Paths.get(indexDir));5 `6 N J& Y$ p7 r4 D/ y+ v( i
IndexReader reader=DirectoryReader.open(dir);
! ^, B8 }' y' \ IndexSearcher is=new IndexSearcher(reader);
% n. k5 F6 i# O; D9 @$ r // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
( G3 y* H3 N, d, U$ J SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
3 x3 k9 K z1 P; k) ^* \ QueryParser parser=new QueryParser("desc", analyzer);" R+ o% @$ ~4 P. P) Z2 u
Query query=parser.parse(q);3 e2 q3 C+ h# [, o
long start=System.currentTimeMillis();6 w* h. h; i) I+ a9 T
TopDocs hits=is.search(query, 10);$ e/ L2 Q: c9 u. x5 C& a
long end=System.currentTimeMillis();, j2 b3 ^# k' M- T
System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");2 ]& w+ e% [) U" b- E- A
1 L& Y# P ]% z' n
QueryScorer scorer=new QueryScorer(query);# b" i; k1 M5 q c) o6 U8 X
Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);0 \% l8 z/ Q0 k, u$ k
SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
5 G: f) b3 G( z& D Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);) M8 }9 L Q! J4 L# m7 T+ F
highlighter.setTextFragmenter(fragmenter);
$ K$ g7 \1 b4 W- V+ ~# s for(ScoreDoc scoreDoc:hits.scoreDocs){9 x- n! I) W8 M% L- L; r) @
Document doc=is.doc(scoreDoc.doc);
6 z4 \5 C- K1 ~) v* s* C System.out.println(doc.get("city"));
, [0 A2 J$ k: o7 H3 x0 O3 B% w& P System.out.println(doc.get("desc"));
5 h8 \8 W* t. _3 D' I String desc=doc.get("desc");. \, i! R: x5 Z% M" h/ D
if(desc!=null){& e$ k' e/ a, [
TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));
. m+ k6 j6 V7 F System.out.println(highlighter.getBestFragment(tokenStream, desc));
: Y2 h3 h1 @ _3 T5 a }
# V3 f3 S) {. a( O0 X. q: g" t }) n7 q/ ?5 ^" {: t/ G& }1 P
reader.close();
& k% l1 M& |' i' p- ] }! O% _( W9 @7 \' S( o
! W4 H6 y9 u& \4 h& i public static void main(String[] args) {- }# b, B2 X6 I2 U: {2 _
String indexDir="D:\\lucene6";
2 N2 b1 C \/ C, V) k0 ?5 k String q="南京文明";
; L u1 Q) f0 ^. K2 ? try {
0 w4 y5 V2 t9 D {! ^ search(indexDir,q);
( E/ d* B U6 J( `: y5 g } catch (Exception e) {
; t2 [1 [, o/ g- {+ I% r' P // TODO Auto-generated catch block7 l( Q3 F/ `' W7 d! x, ?% t# j. u
e.printStackTrace();- q0 ~$ J' e& Q( P4 Y% n4 M3 B
}
) K8 A: R ^9 B4 S3 { }; `) y! y6 C7 X: D
}" W! x2 r1 O7 s- @" a4 m7 Q
$ P# d- t; V# Y6 p \' r* D
1 e8 x i, {; }2 D4 W2 `2 i, k9 }1 |- ]* U
. S8 V! y2 B; G$ G
% ]8 d9 G# u! V V- j2 K |
|