Reputation: 5011
I know how to extract the values of all href attributes (of anchor tags) using cheerio from a responseText string obtained with request (or https), and then create a flat object of (unique) URLs.
What I don't understand is how I would create a nested object (of nested objects) using recursion, without manually writing each loop.
This nested object is of a certain depth (conveniently specified using an argument called depth).
For example, let's say that this is my code:
function get(url, callback) {
  // get "responseText" using "request"
  // extract the anchor tags' href values from the "responseText"
  callback({ "https://url1.com": {}, "https://url2.com": {} });
}
get("https://www.example.com", (urls) => {
console.log(urls);
});
When you run the code the output should be:
{ "https://url1.com": {}, "https://url2.com": {} }
What I don't understand is how I would recursively go to "https://url1.com" and then get this output:
{ "https://url1.com": { "https://sub-url-1.com": {} }, "https://url2.com": { "https://sub-url-2.com": {} } }
What if there is a depth of 5? How would I recursively loop through each sub-URL, 5 levels deep, and get the sub-URLs for each one?
What is this type of recursion called and how do I implement it in JavaScript?
Upvotes: 2
Views: 131
Reputation: 135227
Starting with crawl: it takes a starting url (string) and a starting depth (int) and returns a promised result. The result type (or "shape") describes our intended output; in this case, it is an object with url strings as keys, where each value is either an empty object or another nested result -
// type url = string
// type result = (url, result) object | empty
// crawl : (string * int) -> result promise
const crawl = (initUrl = '/', initDepth = 0) =>
{ const loop = (urls, depth) =>
    parallel
      ( urls
      , u =>
          depth === 0
            ? [ u, {} ]
            : loop (get (u), depth - 1)
                .then (r => [ u, r ])
      )
      .then (Object.fromEntries)
  return loop ([ initUrl ], initDepth)
}
Vertical styling is not common, but it helps the eye identify code elements that align on tab-stop vertical rules. The open whitespace leaves room for comments, though they become less necessary as familiarity with the style grows -
// type url = string
// type result = (url, result) object | empty
// crawl : (string * int) -> result promise
const crawl = (initUrl = '/', initDepth = 0) =>
{ const loop = (urls, depth) =>
    parallel                              // parallel requests
      ( urls                              // each url
      , u =>                              // as u
          depth === 0                     // exit condition
            ? [ u, {} ]                   // base: [ key, value ]
            : loop (get (u), depth - 1)   // inductive: smaller problem
                .then (r => [ u, r ])     // [ key, value ]
      )
      .then (Object.fromEntries)          // convert [ key, value ]
                                          //   to { key: value }
  return loop ([ initUrl ], initDepth)    // init loop
}
This leverages a generic utility, parallel, which is useful for processing a promised array -
// parallel : (('a array) promise * 'a -> 'b) -> ('b array) promise
const parallel = async (p, f) =>
  Promise.all ((await p) .map (x => f (x)))
Or, if you don't want to rely on async-await -
// parallel : (('a array) promise * 'a -> 'b) -> ('b array) promise
const parallel = (p, f) =>
  Promise
    .resolve (p)
    .then (r => Promise.all (r .map (x => f (x))))
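As a quick standalone illustration (the input array and doubling function here are invented purely for the demo), parallel maps a function over a promised array and waits for all of the results -
parallel (Promise.resolve ([ 1, 2, 3 ]), x => x * 2)
  .then (console.log, console.error)
// [ 2, 4, 6 ]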
Given a mocked sitemap and a corresponding get function -
// sitemap : (string, string array) object
const sitemap =
  { "/": [ "/a", "/b", "/c" ]
  , "/a": [ "/a/1", "/a/11", "/a/111" ]
  , "/a/1": [ "/a/1/2", "a/1/22" ]
  , "/a/1/2": [ "/a/1/2/3" ]
  , "/a/1/2/3": [ "/a/1/2/3/4" ]
  , "/a/11": [ "/a/11/2", "a/11/22" ]
  , "/a/11/22": [ "/a/11/22/33" ]
  , "/b": [ "/b/1" ]
  , "/b/1": [ "/b/1/2" ]
  }

// get : string -> (string array) promise
const get = async (url = '') =>
  Promise
    .resolve (sitemap[url] || [])
    .then (delay)

// delay : ('a * int) -> 'a promise
const delay = (x, ms = 250) =>
  new Promise (r => setTimeout (r, ms, x))
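For example, the mock resolves the children listed in sitemap after the simulated delay -
get ('/a') .then (console.log, console.error)
// [ '/a/1', '/a/11', '/a/111' ]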
We can see how crawl responds at various depths -
crawl ('/') .then (console.log, console.error)
// { '/': {} }
crawl ('/', 1) .then (console.log, console.error)
// { '/': { '/a': {}, '/b': {}, '/c': {} } }
crawl ('/b', 1) .then (console.log, console.error)
// { '/b': { '/b/1': {} } }
crawl ('/b', 2) .then (console.log, console.error)
// {
//   "/b": {
//     "/b/1": {
//       "/b/1/2": {}
//     }
//   }
// }
Here we crawl the root "/" with a depth of Infinity. The recursion still terminates: once get returns an empty array for a page with no entries, parallel resolves immediately and no further requests are made -
crawl ("/", Infinity) .then (console.log, console.error)
// {
//   "/": {
//     "/a": {
//       "/a/1": {
//         "/a/1/2": {
//           "/a/1/2/3": {
//             "/a/1/2/3/4": {}
//           }
//         },
//         "a/1/22": {}
//       },
//       "/a/11": {
//         "/a/11/2": {},
//         "a/11/22": {}
//       },
//       "/a/111": {}
//     },
//     "/b": {
//       "/b/1": {
//         "/b/1/2": {}
//       }
//     },
//     "/c": {}
//   }
// }
Simply replace get with a real function that takes an input url and returns an array of hrefs - crawl will work just the same.
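For example, a minimal sketch of such a get using the request and cheerio modules you mentioned might look like the following. The anchor selector, the resolution of relative hrefs against the page url, and the de-duplication with a Set are my own assumptions about what the crawler should do -
// get : string -> (string array) promise
// sketch only: the selector, URL resolution, and de-duplication are assumptions
const request = require ('request')
const cheerio = require ('cheerio')

const get = (url = '') =>
  new Promise ((resolve, reject) =>
    request (url, (err, _res, body) => {
      if (err) return reject (err)
      const $ = cheerio.load (body)              // parse the response html
      const hrefs = $('a[href]')                 // every anchor with an href
        .map ((_, el) => $(el).attr ('href'))
        .get ()
      resolve ([ ...new Set (hrefs.map (h => new URL (h, url) .href)) ])
    })
  )
With that in place, crawl ('https://www.example.com', 2) produces the nested object of sub-URLs two levels deep, just like the mocked demos above.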
Upvotes: 2