Ajay Patel
Ajay Patel

Reputation: 791

Solr French Language Schema

Can anyone please share the best solr schema for French language data? I have already implemented this for my current project, but it does not give proper data for french text.

My schema.xml:

<schema name="example core zero" version="1.1">
  <types>
    <!-- boolean type: "true" or "false" -->
    <fieldtype name="string"  class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
    <!--
      Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.

      These fields support doc values, but they require the field to be
      single-valued and either be required or have a default value.
    -->
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="0">
      <analyzer type="index">
     <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.FrenchLightStemFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="French" />
        <filter class="solr.ASCIIFoldingFilterFactory"/>
      </analyzer>
      <analyzer type="query">
     <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.FrenchLightStemFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="French" />
 <filter class="solr.ASCIIFoldingFilterFactory"/>     
    </analyzer>
    </fieldType>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="sku" class="solr.TextField" omitNorms="false">
            <analyzer type="index">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="\s" replacement=""/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="[^a-zA-Z0-9]" replacement=""/>
                <filter class="solr.NGramFilterFactory" maxGramSize="30"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="\s" replacement=""/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="[^a-zA-Z0-9]" replacement=""/>
            </analyzer>
        </fieldType>
        <fieldType name="exact" class="solr.TextField" omitNorms="false">
            <analyzer type="index">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
            </analyzer>
        </fieldType>

   </types>

 <fields>   
  <!-- general -->
  <field name="id"        type="string"   indexed="true"  stored="true"  multiValued="false" required="true"/>
  <field name="type"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="name"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="core0"     type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="_version_" type="long"     indexed="true"  stored="true"/>
  <field name="sku"       type="sku"     indexed="true"  stored="true"/>

   <!-- Dynamic field definitions allow using convention over configuration
       for fields via the specification of patterns to match field names. 
       EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
       RESTRICTION: the glob-like pattern in the name attribute must have
       a "*" only at the start or the end.  -->

   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
   <dynamicField name="*_is" type="int"    indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true" />
   <dynamicField name="*_ss" type="string"  indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
   <dynamicField name="*_ls" type="long"   indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_t"  type="text_general"    indexed="true"  stored="true"/>
   <dynamicField name="*_txt" type="text_general"   indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_b"  type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_bs" type="boolean" indexed="true" stored="true"  multiValued="true"/>
   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
   <dynamicField name="*_fs" type="float"  indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
   <dynamicField name="*_ds" type="double" indexed="true"  stored="true"  multiValued="true"/>

   <dynamicField name="*_dt"  type="date"    indexed="true"  stored="true"/>
   <dynamicField name="*_dts" type="date"    indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_exact"  type="exact"    indexed="true"  stored="true"/>

   <!-- uncomment the following to ignore any fields that don't already match an existing 
        field name or dynamic field, rather than reporting them as an error. 
        alternately, change the type="ignored" to some other type e.g. "text" if you want 
        unknown fields indexed and/or stored by default --> 
   <!--dynamicField name="*" type="ignored" multiValued="true" /-->

 </fields>

 <!-- field to use to determine and enforce document uniqueness. -->
 <uniqueKey>id</uniqueKey>

 <!-- field for the QueryParser to use when an explicit fieldname is absent -->
 <defaultSearchField>name</defaultSearchField>

 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>
</schema>

Upvotes: 0

Views: 2400

Answers (1)

femtoRgon
femtoRgon

Reputation: 33341

You are stemming twice. Stemmers are not designed to enhance one another, using two can only get you unpredictable results. Pick either FrenchLightStemFilter or the snowball filter, not both.

If you aren't sure which you want, I'd say start with the FrenchLightStemFilter, and if you find it's too conservative, try the snowball filter.

You could also take the simple route, and just try using the pre-defined FrenchAnalyzer, like:

<analyzer class="org.apache.lucene.analysis.fr.FrenchAnalyzer"/>

Doesn't have an ASCIIFoldingFilter, though I don't know whether or not that's really necessary after the stemmer.

Upvotes: 1

Related Questions