Reputation: 9
I have added Tika as a reference to my StormCrawler implementation and that enables to fetch the PDF documents in the crawl. But, the Title, Authors and other properties don't get parsed. I have tried with different combinations to 'index.md.mapping:' and added the corresponding properties to ES_IndexInit, but the content field in Kibana (index) for PDF's documents is always empty. Everything works for HTML pages. Can you please help with some pointers, if I am missing something or I can look at an example?
es-crawler.flux:
name: "crawler"
includes: - resource: true file: "/crawler-default.yaml" override: false
- resource: false
file: "crawler-conf.yaml"
override: true
- resource: false
file: "es-conf.yaml"
override: true
spouts: - id: "spout" className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout" parallelism: 10
bolts: - id: "partitioner" className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt" parallelism: 1 - id: "fetcher" className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt" parallelism: 1 - id: "sitemap" className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt" parallelism: 1 - id: "parse" className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt" parallelism: 5 - id: "index" className: "com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt" parallelism: 1 - id: "status" className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" parallelism: 1 - id: "status_metrics" className: "com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt" parallelism: 4 - id: "redirection_bolt" className: "com.digitalpebble.stormcrawler.tika.RedirectionBolt" parallelism: 1 - id: "parser_bolt" className: "com.digitalpebble.stormcrawler.tika.ParserBolt" parallelism: 1
streams: - from: "spout" to: "partitioner" grouping: type: SHUFFLE
from: "spout" to: "status_metrics" grouping: type: SHUFFLE
from: "partitioner" to: "fetcher" grouping: type: FIELDS args: ["key"]
from: "fetcher" to: "sitemap" grouping: type: LOCAL_OR_SHUFFLE
from: "sitemap" to: "parse" grouping: type: LOCAL_OR_SHUFFLE
from: "parse" to: "index" grouping: type: LOCAL_OR_SHUFFLE
from: "fetcher" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"
from: "sitemap" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"
from: "parse" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"
from: "index" to: "status" grouping: type: FIELDS args: ["url"] streamId: "status"
from: "parse" to: "redirection_bolt" grouping: type: LOCAL_OR_SHUFFLE
from: "redirection_bolt" to: "parser_bolt" grouping: type: LOCAL_OR_SHUFFLE
from: "redirection_bolt" to: "index" grouping: type: LOCAL_OR_SHUFFLE
from: "parser_bolt" to: "index" grouping: type: LOCAL_OR_SHUFFLE
es-injector.flux:
name: "injector"
includes: - resource: true file: "/crawler-default.yaml" override: false
- resource: false
file: "crawler-conf.yaml"
override: true
- resource: false
file: "es-conf.yaml"
override: true
- resource: false
file: "injection-conf.yaml"
override: true
components: - id: "scheme" className: "com.digitalpebble.stormcrawler.util.StringTabScheme" constructorArgs: - DISCOVERED
spouts: - id: "spout" className: "com.digitalpebble.stormcrawler.spout.FileSpout" parallelism: 1 constructorArgs: - "." - "seeds.txt" - ref: "scheme"
bolts: - id: "status" className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" parallelism: 1 - id: "parser_bolt" className: "com.digitalpebble.stormcrawler.tika.ParserBolt" parallelism: 1
streams: - from: "spout" to: "status" grouping: type: FIELDS args: ["url"]
pom.xml:
http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>xyz.com</groupId>
<artifactId>search</artifactId>
<version>search1.0</version>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.3.2</version>
<executions>
<execution>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
<configuration>
<executable>java</executable>
<includeProjectDependencies>true</includeProjectDependencies>
<includePluginDependencies>false</includePluginDependencies>
<classpathScope>compile</classpathScope>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.3.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.storm.flux.Flux</mainClass>
<manifestEntries>
<Change></Change>
<Build-Date></Build-Date>
</manifestEntries>
</transformer>
</transformers>
<!-- The filters below are necessary if you want to include the Tika
module -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>flux-core</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>com.digitalpebble.stormcrawler</groupId>
<artifactId>storm-crawler-core</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>com.digitalpebble.stormcrawler</groupId>
<artifactId>storm-crawler-elasticsearch</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>com.digitalpebble.stormcrawler</groupId>
<artifactId>storm-crawler-tika</artifactId>
<version>1.7</version>
</dependency>
</dependencies>
Upvotes: 0
Views: 717
Reputation: 4864
Your pom and flux files look ok. You could put the injection as part of the main flux to keep things simple.
What's in crawler-conf.yaml? Did you prefix the field names with 'parse.'?
Here is the metadata extracted from the URL you posted above
parse.dcterms:modified: 2004-09-29T20:21:18Z
parse.pdf:PDFVersion: 1.4
parse.access_permission:can_print: true
parse.pdf:docinfo:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.pdf:docinfo:modified: 2004-09-29T20:21:18Z
parse.access_permission:extract_for_accessibility: true
parse.created: Fri Sep 24 15:56:30 BST 2004
parse.pdf:docinfo:created: 2004-09-24T14:56:30Z
parse.xmpTPg:NPages: 7
parse.access_permission:fill_in_form: true
parse.producer: Adobe PDF Library 6.0
parse.pdf:docinfo:title: About Metadata
parse.pdf:docinfo:producer: Adobe PDF Library 6.0
parse.dc:format: application/pdf; version=1.4
parse.access_permission:assemble_document: true
parse.access_permission:modify_annotations: true
parse.dc:title: About Metadata
parse.access_permission:can_print_degraded: true
parse.xmpMM:DocumentID: adobe:docid:indd:de7d50b0-0fc1-11d9-b0d4-cd42e793ca90
parse.xmpMM:DerivedFrom:DocumentID: adobe:docid:indd:a04d199f-0f11-11d9-b74d-bb0abf4f1ab0
parse.title: About Metadata
parse.Creation-Date: 2004-09-24T14:56:30Z
parse.modified: 2004-09-29T20:21:18Z
parse.resourceName: /digitalimag/pdfs/about_metadata.pdf
parse.dc:description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.Last-Save-Date: 2004-09-29T20:21:18Z
parse.creator: Adobe Systems Incorporated
parse.pdf:encrypted: false
parse.trapped: False
parse.pdf:docinfo:creator: Adobe Systems Incorporated
parse.date: 2004-09-29T20:21:18Z
parse.meta:save-date: 2004-09-29T20:21:18Z
parse.Author: Adobe Systems Incorporated
parse.X-Parsed-By: org.apache.tika.parser.DefaultParser
parse.X-Parsed-By: org.apache.tika.parser.pdf.PDFParser
parse.pdf:docinfo:creator_tool: Adobe InDesign CS (3.0.1)
parse.dcterms:created: 2004-09-24T14:56:30Z
parse.access_permission:can_modify: true
parse.subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.meta:author: Adobe Systems Incorporated
parse.access_permission:extract_content: true
parse.xmp:CreatorTool: Adobe InDesign CS (3.0.1)
parse.dc:creator: Adobe Systems Incorporated
parse.cp:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.pdf:docinfo:trapped: False
parse.meta:creation-date: 2004-09-24T14:56:30Z
parse.xmpMM:DerivedFrom:InstanceID: de7d50af-0fc1-11d9-b0d4-cd42e793ca90
parse.Last-Modified: 2004-09-29T20:21:18Z
parse.Content-Type: application/pdf
parse.description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
Your conf should contain something like
indexer.md.mapping:
- parse.title=title
- parse.Author=author
As you can guess from the code of the test case, you need to add the file in external/tika/src/test/resources/ and refer to the name of the file in the test code, as with about_metadata.pdf in the example below
@Test
public void testMetadata() throws IOException {
bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(),
new OutputCollector(output));
parse("https://www.adobe.com/digitalimag/pdfs/about_metadata.pdf",
"about_metadata.pdf");
List<List<Object>> outTuples = output.getEmitted();
// single document
Assert.assertEquals(1, outTuples.size());
// metadata
Metadata md = (Metadata) outTuples.get(0).get(2);
Assert.assertTrue(
md.getFirstValue("parse.pdf:docinfo:subject").contains(
"By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient."));
}
UPDATE
on closer inspection, the problem comes from your flux. The redirection bolt sends the tuple to Tika on a bespoke stream named 'tika'. The definition should therefore be
from: "redirection_bolt"
to: "parser_bolt"
grouping:
type: LOCAL_OR_SHUFFLE
streamId: "tika"
Upvotes: 2