Reputation: 1
To import data from my database into Apache Solr, I am using the DataImportHandler. Integrating the DataImportHandler was successful, and the indexing appears to run as well. However, "Total Documents Processed" is 0 and I can't find any errors in the logs.
I use Apache Solr 3.5 with Drupal, and a lot of items are already indexed, so the schema.xml already has many fields. With the DataImportHandler I want to add some more content and, therefore, some more fields. For testing, I have a test database with a table "test" that has one column "id" (primary key) and another column "test_name" (text).
I have seen similar issues in other posts, but none of the suggestions helped me, so I hope you can. I don't know whether something is wrong with my field mapping or whether it's another problem.
This is my dataconfig.xml:
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" encoding="UTF-8"
user="user" password="password" batchSize="-1"/>
<entity pk="id" query="SELECT `id` AS ID,`test_name` AS TEST_NAME FROM `test`">
<field column="ID" name="id" />
<field column="TEST_NAME" name="test_name" />
My schema.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="drupal-3.0-beta9-solr3" version="1.3">
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldtype name="binary" class="solr.BinaryField"/>
<!-- numeric field types that can be sorted, but are not optimized for range queries -->
<fieldType name="integer" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
<!-- A Trie based date field for faster date range queries and date faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
<!-- A text field that only splits on whitespace for exact matching of words -->
<fieldType name="text_ws" class="solr.TextField" omitNorms="true" positionIncrementGap="100">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
<filter class="solr.WordDelimiterFilterFactory"
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="german-common-nouns.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/>
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="35" side="front"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<analyzer type="query">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<!-- An unstemmed text field - good if one does not know the language of the field -->
<fieldType name="text_und" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
<filter class="solr.LowerCaseFilterFactory"/>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
<filter class="solr.WordDelimiterFilterFactory"
<filter class="solr.LowerCaseFilterFactory"/>
<fieldType name="edge_n2_kw_text" class="solr.TextField" omitNorms="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="25" />
<analyzer type="query">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<!-- Setup simple analysis for spell checking -->
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
<tokenizer class="solr.StandardTokenizerFactory" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.LengthFilterFactory" min="4" max="20" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
<fieldType name="sortString" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!-- A random sort type -->
<fieldType name="rand" class="solr.RandomSortField" indexed="true" />
<fieldtype name="ignored" stored="false" indexed="false" class="solr.StrField" />
<fieldType name="point" class="solr.PointType" dimension="2" subFieldType="tdouble"/>
<fieldType name="location" class="solr.LatLonType" subFieldType="tdouble"/>
<fieldtype name="geohash" class="solr.GeoHashField"/>
<field name="id" type="string" indexed="true" stored="true" required="true" />
<!-- entity_id is the numeric object ID, e.g. Node ID, File ID -->
<field name="entity_id" type="long" indexed="true" stored="true" />
<!-- entity_type is 'node', 'file', 'user', or some other Drupal object type -->
<field name="entity_type" type="string" indexed="true" stored="true" required="true" />
<!-- bundle is a node type, or as appropriate for other entity types -->
<field name="bundle" type="string" indexed="true" stored="true"/>
<field name="bundle_name" type="string" indexed="true" stored="true"/>
<field name="site" type="string" indexed="true" stored="true"/>
<field name="hash" type="string" indexed="true" stored="true"/>
<field name="url" type="string" indexed="true" stored="true"/>
<!-- label is the default field for a human-readable string for this entity (e.g. the title of a node) -->
<field name="label" type="text" indexed="true" stored="true" termVectors="true" omitNorms="true"/>
<!-- The string version of the title is used for sorting -->
<copyField source="label" dest="sort_label"/>
<!-- content is the default field for full text search - dump crap here -->
<field name="content" type="text" indexed="true" stored="true" termVectors="true"/>
<field name="teaser" type="text" indexed="false" stored="true"/>
<field name="path" type="string" indexed="true" stored="true"/>
<field name="path_alias" type="text" indexed="true" stored="true" termVectors="true" omitNorms="true"/>
<!-- new fields for grouping pdfs -->
<field name="article_group" type="string" indexed="true" stored="true" termVectors="true" multiValued="false"/>
<field name="pdf_path" type="string" indexed="true" stored="true"/>
<copyField source="path" dest="article_group"/>
<field name="tid" type="long" indexed="true" stored="true" multiValued="true"/>
<field name="test_name" type="text" indexed="true" stored="true" termVectors="true"/>
<field name="taxonomy_names" type="text" indexed="true" stored="false" termVectors="true" multiValued="true" omitNorms="true"/>
<!-- Copy terms to a single field that contains all taxonomy term names -->
<copyField source="tm_vid_*" dest="taxonomy_names"/>
<!-- Here, default is used to create a "timestamp" field indicating
when each document was indexed.-->
<field name="timestamp" type="tdate" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- This field is used to build the spellchecker index -->
<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
<copyField source="label" dest="spell"/>
<copyField source="content" dest="spell"/>
<!-- A set of fields to contain text extracted from HTML tag contents which we
can boost at query time. -->
<dynamicField name="tags_*" type="text" indexed="true" stored="false" omitNorms="true"/>
<!-- For 2 and 3 letter prefix dynamic fields, the 1st letter indicates the data type and
the last letter is 's' for single valued, 'm' for multi-valued -->
<!-- We use long for integer since 64 bit ints are now common in PHP. -->
<dynamicField name="is_*" type="long" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="im_*" type="long" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="iss_*" type="slong" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="ism_*" type="slong" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="ss_*" type="string" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="sm_*" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- Normal text fields are for full text - the relevance of a match depends on the length of the text -->
<dynamicField name="ts_*" type="text" indexed="true" stored="true" multiValued="false" termVectors="true"/>
<dynamicField name="tm_*" type="text" indexed="true" stored="true" multiValued="true" termVectors="true"/>
<!-- Unstemmed text fields for full text - the relevance of a match depends on the length of the text -->
<dynamicField name="tus_*" type="text_und" indexed="true" stored="true" multiValued="false" termVectors="true"/>
<dynamicField name="tum_*" type="text_und" indexed="true" stored="true" multiValued="true" termVectors="true"/>
<!-- These text fields omit norms - useful for extracted text like taxonomy_names -->
<dynamicField name="tos_*" type="text" indexed="true" stored="true" multiValued="false" termVectors="true" omitNorms="true"/>
<dynamicField name="tom_*" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" omitNorms="true"/>
<!-- Special-purpose text fields -->
<dynamicField name="tes_*" type="edge_n2_kw_text" indexed="true" stored="true" multiValued="false" omitTermFreqAndPositions="true" />
<dynamicField name="tem_*" type="edge_n2_kw_text" indexed="true" stored="true" multiValued="true" omitTermFreqAndPositions="true" />
<dynamicField name="tws_*" type="text_ws" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="twm_*" type="text_ws" indexed="true" stored="true" multiValued="true"/>
<!-- trie dates are preferred, so give them the 2 letter prefix -->
<dynamicField name="ds_*" type="tdate" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="dm_*" type="tdate" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="bm_*" type="boolean" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="bs_*" type="boolean" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="its_*" type="tlong" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="itm_*" type="tlong" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="fts_*" type="tfloat" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="ftm_*" type="tfloat" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="pts_*" type="tdouble" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="ptm_*" type="tdouble" indexed="true" stored="true" multiValued="true"/>
<!-- Binary fields can be populated using base64 encoded data. Useful e.g. for embedding
a small image in a search result using the data URI scheme -->
<dynamicField name="xs_*" type="binary" indexed="false" stored="true" multiValued="false"/>
<dynamicField name="xm_*" type="binary" indexed="false" stored="true" multiValued="true"/>
<!-- In rare cases a sfloat rather than tfloat is needed for sortMissingLast -->
<dynamicField name="fss_*" type="sfloat" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="fsm_*" type="sfloat" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="pss_*" type="sdouble" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="psm_*" type="sdouble" indexed="true" stored="true" multiValued="true"/>
<!-- In rare cases a date rather than tdate is needed for sortMissingLast -->
<dynamicField name="dds_*" type="date" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="ddm_*" type="date" indexed="true" stored="true" multiValued="true"/>
<!-- In case a 32 bit int is really needed, we provide these fields. 'h' is mnemonic for 'half word', i.e. 32 bit on 64 arch -->
<dynamicField name="hs_*" type="integer" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="hm_*" type="integer" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="hss_*" type="sint" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="hsm_*" type="sint" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="hts_*" type="tint" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="htm_*" type="tint" indexed="true" stored="true" multiValued="true"/>
<!-- Begin added fields to use features in Solr 3.4+ -->
<dynamicField name="points_*" type="point" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="pointm_*" type="point" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="locs_*" type="location" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="locm_*" type="location" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="geos_*" type="geohash" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="geom_*" type="geohash" indexed="true" stored="true" multiValued="true"/>
<!-- End added fields for Solr 3.4+ -->
<!-- Sortable version of the dynamic string field -->
<dynamicField name="sort_*" type="sortString" indexed="true" stored="false"/>
<copyField source="ss_*" dest="sort_*"/>
<!-- A random sort field -->
<dynamicField name="random_*" type="rand" indexed="true" stored="true"/>
<!-- This field is used to store access information (e.g. node access grants), as opposed to field data -->
<dynamicField name="access_*" type="integer" indexed="true" stored="false" multiValued="true"/>
<!-- The following causes solr to ignore any fields that don't already match an existing
field name or dynamic field, rather than reporting them as an error.
Alternately, change the type="ignored" to some other type e.g. "text" if you want
unknown fields indexed and/or stored by default -->
<dynamicField name="*" type="ignored" multiValued="true" />
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field -->
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="AND"/>
I have both a uniqueKey (id) and a defaultSearchField defined.
My output:
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">8</int>
<lst name="initArgs">
<lst name="defaults">
<str name="config">dataconfig.xml</str>
<str name="command">full-import</str>
<str name="status">idle</str>
<str name="importResponse"/>
<lst name="statusMessages">
<str name="Total Requests made to DataSource">1</str>
<str name="Total Rows Fetched">3</str>
<str name="Total Documents Skipped">0</str>
<str name="Full Dump Started">2013-11-14 14:24:04</str>
<str name="Total Documents Processed">0</str>
<str name="Total Documents Failed">3</str>
<str name="Time taken ">0:0:0.45</str>
<str name="WARNING">
This response format is experimental. It is likely to change in the future.
I have already tried a lot of different things and nothing worked. Please help me!
Upvotes: 0
Views: 2526
Reputation: 21
You're missing the required fields. Required fields must always be assigned a value.
Upvotes: 1
Reputation: 1790
Please check the required fields in your schema.xml, because Solr does not process a document that is missing a required field. (That's why you get a non-zero "Total Rows Fetched" but "Total Documents Processed" = 0.)
So your mistake here is that you're defining two required fields (id, entity_type), as shown below:
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="entity_type" type="string" indexed="true" stored="true" required="true" />
And here in your entity you only populate the 'id' field; 'entity_type' is never populated:
<entity pk="id" query="SELECT `id`,`test_name` FROM `test`">
<field column="id" name="id" />
<field column="test_name" name="test_name" />
<!-- the other required field (entity_type) is missing here -->
So either populate the 'entity_type' field in your entity,
or, in your case, remove the required="true" attribute from its definition in schema.xml.
Upvotes: 2
Reputation: 462
In schema.xml, among the field definitions, keep only those fields that are present in your data-config:
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="test_name" type="string" indexed="true" stored="true" required="true" />
And to keep it simple, try modifying that part of your data-config like this:
<entity pk="id" query="SELECT `id`,`test_name` FROM `test`">
<field column="id" name="id" />
<field column="test_name" name="test_name" />
Upvotes: 0