博客统计信息

用户名:william_xu
文章数:36
评论数:17
访问量:9916
无忧币:204
博客积分:402
博客等级:3
注册日期:2011-09-26

我的技术圈(3)

更多>>
Nutch集成Solr中文分词Schema
2012-02-07 09:08:09
原创作品,允许转载,转载时请务必以超链接形式标明文章 原始出处 、作者信息和本声明。否则将追究法律责任。http://williamx.blog.51cto.com/3629295/773815

<?xml version="1.0" encoding="UTF-8"?>

<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
license agreements. See the NOTICE file distributed with this work for additional 
information regarding copyright ownership. The ASF licenses this file to 
You under the Apache License, Version 2.0 (the "License"); you may not use 
this file except in compliance with the License. You may obtain a copy of 
the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
by applicable law or agreed to in writing, software distributed under the 
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
OF ANY KIND, either express or implied. See the License for the specific 
language governing permissions and limitations under the License. -->
<!-- Description: This document contains the Solr 3.1 schema definition to be
used with the Solr integration currently built into Nutch. See
https://issues.apache.org/jira/browse/NUTCH-442
https://issues.apache.org/jira/browse/NUTCH-699
https://issues.apache.org/jira/browse/NUTCH-994
https://issues.apache.org/jira/browse/NUTCH-997 and
http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?view=markup
for more info. -->
<schema name="nutch" version="1.3">
<types>
  <!-- Non-analyzed string: the whole value is indexed as a single term.
       Documents missing the field sort last; norms are omitted. -->
  <fieldType name="string" class="solr.StrField" omitNorms="true" sortMissingLast="true"/>

  <!-- Trie-encoded numeric and date types. precisionStep="0" is set on all
       three, so no additional lower-precision terms are indexed. -->
  <fieldType name="long" class="solr.TrieLongField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
  <fieldType name="float" class="solr.TrieFloatField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
  <fieldType name="date" class="solr.TrieDateField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>

  <!-- Text type declared without any analyzer chain. Within this schema it is
       referenced only by the cache_content field, which is stored but not indexed. -->
  <fieldType name="cache_text" class="solr.TextField" positionIncrementGap="100"/>

  <!-- General text: tokenized by the mmseg4j tokenizer (Chinese word
       segmentation) in "complex" mode, then lower-cased. The same chain is
       applied at index time and at query time.
       NOTE(review): dicPath="dic" is resolved by mmseg4j, presumably relative
       to the Solr home directory; confirm against the deployment. -->
  <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
    <analyzer>
      <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" dicPath="dic" mode="complex"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
  </fieldType>

  <!-- URL text: standard tokenizer, lower-casing, then word-delimiter
       splitting that emits both word parts and number parts. -->
  <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.WordDelimiterFilterFactory" generateNumberParts="1" generateWordParts="1"/>
    </analyzer>
  </fieldType>
</types>
<fields>
  <!-- Document identifier; referenced by <uniqueKey> further down in this schema. -->
  <field name="id" type="string" indexed="true" stored="true"/>

  <!-- Core fields: stored for retrieval only, not searchable. -->
  <field name="segment" type="string" indexed="false" stored="true"/>
  <field name="digest" type="string" indexed="false" stored="true"/>
  <field name="boost" type="float" indexed="false" stored="true"/>

  <!-- Fields for the index-basic plugin. "url" is the only field marked
       required. "content" is searchable but not stored; "cache" and
       "cache_content" are stored but not searchable. -->
  <field name="host" type="url" indexed="true" stored="false"/>
  <field name="site" type="string" indexed="true" stored="false"/>
  <field name="url" type="url" indexed="true" stored="true" required="true"/>
  <field name="content" type="text" indexed="true" stored="false"/>
  <field name="title" type="text" indexed="true" stored="true"/>
  <field name="cache" type="string" indexed="false" stored="true"/>
  <field name="cache_content" type="cache_text" indexed="false" stored="true"/>
  <field name="tstamp" type="date" indexed="true" stored="true"/>

  <!-- Fields for the index-anchor plugin. -->
  <field name="anchor" type="string" indexed="true" stored="true" multiValued="true"/>

  <!-- Fields for the index-more plugin. -->
  <field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="contentLength" type="long" indexed="false" stored="true"/>
  <field name="lastModified" type="date" indexed="false" stored="true"/>
  <field name="date" type="date" indexed="true" stored="true"/>

  <!-- Fields for the languageidentifier plugin. -->
  <field name="lang" type="string" indexed="true" stored="true"/>

  <!-- Fields for the subcollection plugin. -->
  <field name="subcollection" type="string" indexed="true" stored="true" multiValued="true"/>

  <!-- Fields for the feed plugin ("tag" is also used by microformats-reltag). -->
  <field name="author" type="string" indexed="true" stored="true"/>
  <field name="tag" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="feed" type="string" indexed="true" stored="true"/>
  <field name="publishedDate" type="date" indexed="true" stored="true"/>
  <field name="updatedDate" type="date" indexed="true" stored="true"/>

  <!-- Fields for the creativecommons plugin. -->
  <field name="cc" type="string" indexed="true" stored="true" multiValued="true"/>
</fields>
<!-- Field whose value uniquely identifies a document; "id" is declared in <fields>. -->
<uniqueKey>id</uniqueKey>
<!-- Field searched when a query does not name a field. "content" is declared
     indexed but not stored, so it can be matched but not returned in results. -->
<defaultSearchField>content</defaultSearchField>
<!-- Query clauses with no explicit boolean operator are combined with OR. -->
<solrQueryParser defaultOperator="OR" />
</schema>
 

本文出自 “果壳中的宇宙” 博客,请务必保留此出处http://williamx.blog.51cto.com/3629295/773815

分享至
更多
一键收藏,随时查看,分享好友!
0人
了这篇文章
类别:搜索技术技术圈()┆阅读()┆评论() ┆ 推送到技术圈返回首页

文章评论

 
 

发表评论            

【技术门诊】专家解析:软考重点难点及应试技巧
昵  称:
登录  快速注册
验证码:

请点击后输入验证码博客过2级,无需填写验证码

内  容: