Reputation: 2589
my database has plenty of documents looking like this:
{
"_id": "7fa2e319f3b908818d1c6eda9205fc6f",
"_rev": "3-9db3d8cc45c9a45b35c3981011e77bb5",
"Guid": "2d69ba2e-972e-4659-8d3f-35f660229b6d",
"CompanyId": "foo",
"Date": "2021-02-12T08:59:48Z",
"Author": "Me",
...
}
The CompanyId
key appears in all documents and several documents have the same CompanyId
value.
I'm looking for a way to get the document with the most recent Date
for each different CompanyId
value. Said differently, I want an array of documents, each document with a different CompanyId
value.
Here is a map/reduce pair:
function(doc) {
emit(doc.CompanyId, {doc: doc, date: Date.parse(doc.Date)});
}
function(keys, values, rereduce) {
var maxDate = 0;
var maxDoc = '';
for(i = 0; i<values.length; i++) {
if(values[i].date>maxDate) {
maxDate=values[i].date;
maxDoc=values[i].doc;
}
}
return {date: maxDate}
}
If I pass group=true
to my query, the map/reduce pair above successfully returns the dates of the most recent documents by company id:
{
"rows": [
{
"key": "a-testagency-556049-0897",
"value": {
"date": 1613123486000
}
},
{
"key": "a-testagency-556677-1317",
"value": {
"date": 1613123435000
}
}
]
}
But I want more than just the dates in the result, I want the documents themselves so I change the last line to:
return {date: maxDate, doc: maxDoc}
This unfortunately throws errors such as:
"Reduce output must shrink more rapidly: input size: 37325 output size: 37233"
Is it because some company ids have just one matching document?
If I change the return line again to something smaller:
return {date: maxDate, companyId: maxDoc.Author}
Then the query now returns null
as value instead of the date and the author.
Upvotes: 0
Views: 509
Reputation: 4963
The code snippet at the end uses pouchDB to demonstrate one way (among a few) to achieve the desired result. This is a very simple solution, but the result is gained from 2 requests rather than 1 request.
To understand how this works check out the Reduce/Rereduce in the CouchDB documentation, particularly regarding rereduce
. An in depth write up here would just be a rehash of that documentation.
Here's the design document in the demo snippet.
{
"_id": "_design/SO-66173759",
"views": {
"most_recent": {
"map": `function (doc) {
if(doc.CompanyId && doc.Date) {
emit([doc.CompanyId,doc.Date],doc._id);
}
}`,
"reduce": `function(keys,values,rereduce) {
return values[0];
}`
}
}
}
The emit produces the complex key [companyId, Date], value = document _id. For example:
[Acme Amalgamated,2012-07-26T18:57:12.409Z] 7fb19b37-5baa-4c78-b122-fd0bc67253d0
[Acme Amalgamated,2013-08-16T02:53:12.062Z] 93d480a5-a666-4fca-b4a4-c1ba7935de92
[Acme Amalgamated,2013-08-23T22:12:14.401Z] 96a236b1-50d6-4b7e-af45-d05432ca7847
[Acme Amalgamated,2015-05-20T13:17:21.500Z] ad55f6e7-6c61-4793-a75c-9601debd5eaf
[Acme Amalgamated,2015-12-17T09:18:33.741Z] cf1a0844-aa54-42e2-a9d6-16e80f5c420a
[Acme Amalgamated,2016-08-24T04:01:15.551Z] 0658a417-901f-4cc7-a005-c10c72d2720c
[Acme Amalgamated,2020-06-19T07:58:44.680Z] 56126238-496a-475e-9001-8a4a86e055a3 **
[Cyberdyne Systems,2012-02-02T17:07:30.692Z] d8a2b649-10b0-467b-b7e1-304b13f7560a
[Cyberdyne Systems,2012-05-31T17:47:15.607Z] 64699ddb-e8f5-449f-b1cb-804ccddbbfb2
[Cyberdyne Systems,2012-07-11T13:08:00.879Z] 1aa285f4-dc4b-49e4-9f0b-331b3d552dad
[Cyberdyne Systems,2014-08-12T22:43:47.651Z] f5597fb0-e37e-4a62-8ef2-244140b0439b
[Cyberdyne Systems,2015-03-25T04:54:40.459Z] c80f58d3-9fb1-466f-9fd2-2b3f86331fc3
[Cyberdyne Systems,2017-08-13T02:42:49.530Z] c2b2d636-b444-4216-9b0b-e982ea467e51 **
[Lorem Ipsum Inc.,2010-01-26T21:31:34.752Z] d6e2b66d-286f-48b9-b678-7afeca2c3c01
[Lorem Ipsum Inc.,2012-07-04T02:29:20.509Z] c7d528fe-ce4f-4c35-a054-39f726962b4a
[Lorem Ipsum Inc.,2012-10-26T01:21:11.765Z] 773af027-1fdd-4f41-b1b1-8d5169871684
[Lorem Ipsum Inc.,2012-12-15T22:27:12.999Z] 78d60df3-aed6-4cab-8071-4d08ccfd0184
[Lorem Ipsum Inc.,2014-11-04T02:03:49.328Z] 098f600c-8059-44d5-a526-b67f0bbeb609
[Lorem Ipsum Inc.,2017-04-30T14:54:17.967Z] 639ddd9c-9a41-42fd-b1be-7fea8eb4f3ec
[Lorem Ipsum Inc.,2017-11-21T10:45:53.152Z] ee2a7445-276d-4e9c-9f9d-1fa76d87741f **
** Most recent document in companyId group
Generally, emit'ing the document._id is redundant since view queries return the document _id - but there's a good reason to do so in this case which will be apparent later.
Pay close attention how the index is ordered; within each company group, the most recent date is the last entry of a companyId group.
Now the terse reduce function
function(keys,values,rereduce) { return values[0]; }
will produce a pleasant result with the following parameters
{
reduce: true,
group_level: 1,
descending: true
}
The descending
property is the killer here because the reduce
function will receive keys and values in reverse order; by returning the 0th value element, reduce
will always produce the most recent document id according to the Date field, even when rereduce = true
.
Given the prior example view, accessing the view with the above parameters would return the following key/value rows
[Lorem Ipsum Inc.] ee2a7445-276d-4e9c-9f9d-1fa76d87741f
[Cyberdyne Systems] c2b2d636-b444-4216-9b0b-e982ea467e51
[Acme Amalgamated] 56126238-496a-475e-9001-8a4a86e055a3
and there it is - calling _all_docs
with the values (document ids) with the following parameters
{
include_docs: true,
keys: [
"ee2a7445-276d-4e9c-9f9d-1fa76d87741f",
"c2b2d636-b444-4216-9b0b-e982ea467e51",
"56126238-496a-475e-9001-8a4a86e055a3"
]
}
would return the most recently dated documents, distinct by companyId.
Here's a demo snippet. The code generates 20 random documents and produces condensed view data.
Important
Do understand that the entire index is scanned for an unconstrained reduce. If your index contains millions of documents then a more complicated approach may be in order, for example leveraging start/end keys or something interesting involving the _changes feed. YMMV.
async function view_reduce() {
let result = await db.query('SO-66173759/most_recent', {
reduce: true,
group_level: 1,
descending: true
});
// show
gel('view_reduce').innerText = result.rows.map(row => `${row.key}\t${row.value}`).join('\n');
return result;
}
async function showMostRecentDocs() {
// Use results from reduce to get documents
let result = await view_reduce();
// The result row values are document ids; use allDocs to fetch the docs
result = await db.allDocs({
include_docs: true,
keys: result.rows.map(row => row.value)
});
//show
gel('view_most_recent').innerText = result.rows.map(row => [row.doc.CompanyId, row.doc.Date, row.doc.Author].join('\t')).join('\n');
}
async function showViewKeyValues() {
let result = await db.query('SO-66173759/most_recent', {
reduce: false,
include_docs: false
});
//show
gel('view_key_value').innerText =
result.rows.map(row => `[${row.key}]\t${row.value}`).join('\n');
}
async function showViewDocs() {
let result = await db.query('SO-66173759/most_recent', {
reduce: false,
include_docs: true
});
//show
gel('view_docs').innerText = result.rows.map(row => [row.doc.CompanyId, row.doc.Date, row.doc.Author].join('\t')).join('\n');
}
function getDocsToInstall(count) {
const sourceDocs = [{
"CompanyId": "Acme Amalgamated",
"Author": "Wile E. Coyote",
},
{
"CompanyId": "Acme Amalgamated",
"Author": "Road R. Unner",
},
{
"CompanyId": "Lorem Ipsum Inc.",
"Author": "Caesar Augustus",
},
{
"CompanyId": "Lorem Ipsum Inc.",
"Author": "Marcus Aurelius",
},
{
"CompanyId": "Cyberdyne Systems",
"Author": "Miles Dyson",
}
];
// design document
const ddoc = {
"_id": "_design/SO-66173759",
"views": {
"most_recent": {
"map": `function (doc) {
if(doc.CompanyId && doc.Date) {
emit([doc.CompanyId,doc.Date],doc._id);
}
}`,
"reduce": `function(keys,values,rereduce) {
return values[0];
}`
}
}
};
// create a set of random documents.
let docs = new Array(count);
const dateSeed = [new Date(2010, 0, 1), new Date(), 0, 24];
while (count--) {
let doc = Object.assign({}, sourceDocs[Math.random() * sourceDocs.length | 0]);
doc.Date = randomDate(...dateSeed).toISOString();
docs[count] = doc;
}
docs.push(ddoc);
return docs;
}
const db = new PouchDB('SO-66173759', {
adapter: 'memory'
});
// install docs and show view in various forms.
(async() => {
await db.bulkDocs(getDocsToInstall(20));
await showViewDocs();
await showViewKeyValues();
return showMostRecentDocs();
})();
const gel = id => document.getElementById(id);
/*
https://stackoverflow.com/questions/31378526/generate-random-date-between-two-dates-and-times-in-javascript/31379050#31379050
*/
function randomDate(start, end, startHour, endHour) {
var date = new Date(+start + Math.random() * (end - start));
var hour = startHour + Math.random() * (endHour - startHour) | 0;
date.setHours(hour);
return date;
}
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/pouchdb.min.js"></script>
<script src="https://github.com/pouchdb/pouchdb/releases/download/7.1.1/pouchdb.memory.min.js"></script>
<div>View: distinct, most recent</div>
<pre id='view_most_recent'></pre>
<hr/>
<div>View: reduce result</div>
<pre id='view_reduce'></pre>
<hr/>
<div>View: key/value</div>
<pre id='view_key_value'></pre>
<hr/>
<div>View: docs</div>
<pre id='view_docs'></pre>
FWIW
Don't store documents as values! It is extremely inefficient. Although a good reason may exist somewhere in the universe, I've never encountered a legitimate reason to do so.
Upvotes: 2