Reputation: 9825
My current Node.js code creates a stream from a very large USPTO patent XML file (approx. 100 MB) and builds a patentGrant object while parsing the XML stream. The patentGrant object includes the publication number, publication country, publication date, and kind of patent. I am trying to create a database containing all of the patentGrant objects using Elasticsearch. I've successfully added code to connect to the local Elasticsearch DB, but I am having trouble understanding the elasticsearch-js API, and I don't know how I should go about uploading the patentGrant object to the DB. From the following tutorial and a previous Stack Overflow question I asked here, it seems like I should use the bulk API.
Here's my ParseXml.js code:
// ParseXml.js: parse a USPTO patent-grant XML file with xml-stream and print
// one summary object per <us-patent-grant> element.
var CreateParsableXml = require('./CreateParsableXml.js');
var XmlParserStream = require('xml-stream');
// var Upload2ES = require('./Upload2ES.js');
var es = require('elasticsearch');

// Client for the local Elasticsearch node; constructed here but not used by
// anything else in this script.
var client = new es.Client({ host: 'localhost:9200' });

// Build the xml-stream parser over whatever concatXmlStream returns for the
// input file (presumably a readable stream -- confirm in CreateParsableXml.js).
var parseXml = new XmlParserStream(
  CreateParsableXml.concatXmlStream('ipg140107.xml')
);

// Runs each time a <us-patent-grant> element has been fully read.
parseXml.on('endElement: us-patent-grant', function (grantNode) {
  // All four reported fields live under the same document-id node.
  var docId = grantNode['us-bibliographic-data-grant']['publication-reference']['document-id'];

  var patentGrant = {
    pubNo: docId['doc-number'],
    pubCountry: docId['country'],
    kind: docId['kind'],
    pubDate: docId['date']
  };

  console.log(patentGrant);
});

// Runs once when the parser emits 'end'.
parseXml.on('end', function () {
  console.log('all done');
});
Upvotes: 1
Views: 1918
Reputation: 7207
The bulk api, as it says in the docs you linked, is used for "index" and "delete" operations.
// For every fully parsed <us-patent-grant> element, build a patentGrant
// summary and send it to Elasticsearch as its own document.
parseXml.on('endElement: us-patent-grant', function(patentGrantElement) {
  // All four fields are read from the same document-id node.
  var docId = patentGrantElement['us-bibliographic-data-grant']['publication-reference']['document-id'];

  var patentGrant = {
    pubNo: docId['doc-number'],
    pubCountry: docId['country'],
    kind: docId['kind'],
    pubDate: docId['date']
  };

  // No `id` is supplied, so Elasticsearch is expected to generate one
  // (see the automatic ID generation docs) -- confirm for your client version.
  client.create({
    index: 'myindex',
    type: 'mytype',
    body: patentGrant
  }, function(err) {
    // Report failures instead of silently dropping them; otherwise a
    // rejected document would go unnoticed.
    if (err) {
      console.error('Failed to index patent grant ' + patentGrant.pubNo + ':', err);
    }
  });

  console.log(patentGrant);
});
Without an ID, Elasticsearch should generate one automatically, as per https://www.elastic.co/guide/en/elasticsearch/reference/1.6/docs-index_.html#_automatic_id_generation
Upvotes: 1