Reputation: 2543
This is a program to parse some sites. The first site is site1. All the logic to parse that particular site is located in (-> config :site1)
;; Namespace for the scraper. Enlive is aliased as `html`.
;; The reference clause must be the keyword :require; a bare `require`
;; symbol is rejected by the ns spec on current Clojure versions.
(ns program.core
  (:require [net.cgrand.enlive-html :as html]))
;; Scraping configuration, keyed by site id.
;;
;; Each site entry holds:
;;   :site-url       vector of page URLs
;;   :url-encoding   charset name passed to the fetcher
;;   :parsing-index  map of field -> {:selector <enlive selector>
;;                                    :trimming-fn <fn of one selected node>}
(def config
  {:site1
   {:site-url
    ["http://www.site1.com/page/1"
     "http://www.site1.com/page/2"
     "http://www.site1.com/page/3"
     "http://www.site1.com/page/4"]

    :url-encoding "iso-8859-1"

    :parsing-index
    {:date
     {:selector    [[:td.PadMed (html/nth-of-type 1)]
                    :table
                    [:tr (html/nth-of-type 2)]
                    [:td (html/nth-of-type 3)]
                    [:span]]
      ;; Take the first child of the node's :content, unwrapping the
      ;; surrounding sequence.
      :trimming-fn (fn [node] (first (:content node)))}

     :title
     {:selector    [[:td.PadMed (html/nth-of-type 1)]
                    :table
                    :tr
                    [:td (html/nth-of-type 2)]
                    [:a]]
      ;; First child of the first child of the node.
      :trimming-fn (fn [node] (-> node :content first :content first))}

     :url
     {:selector    [[:td.PadMed (html/nth-of-type 1)]
                    :table
                    :tr
                    [:td (html/nth-of-type 2)]
                    [:a]]
      ;; Prefix the node's href attribute with the site base URL.
      ;; NOTE(review): the base here is www.site.com while the pages above
      ;; are on www.site1.com -- confirm which host is intended.
      :trimming-fn (fn [node]
                     (str "http://www.site.com" (-> node :attrs :href)))}}}})
;=== Fetch fn ===;
(defn fetch-encoded-url
  "Downloads `url` (a string), decodes the bytes using `encoding`
  (defaults to utf-8 when omitted) and returns the result of
  html/html-resource on the decoded text.

  The network stream and the reader are closed before returning, even if
  parsing throws."
  ([url] (fetch-encoded-url url "utf-8"))
  ([url encoding]
   ;; .openStream always yields an InputStream, whereas .getContent may
   ;; return another kind of object depending on the content type.
   (with-open [stream (.openStream (java.net.URL. url))
               reader (java.io.InputStreamReader. stream encoding)]
     ;; Force the node seq while the reader is still open so nothing lazy
     ;; escapes the with-open scope.
     (doall (html/html-resource reader)))))
Now I want to parse the pages contained in (-> config :site1 :site-url). In this example I use only the first URL, but how can I design this to actually do a kind of master `for`
for all the URLs?
(defn parse-element
  "Fetches the first :site-url of :site1, selects the nodes matching the
  :selector configured for `element`, and returns a vector with that
  element's :trimming-fn applied to every selected node."
  [element]
  (let [site  (:site1 config)
        rule  (-> site :parsing-index element)
        trim  (:trimming-fn rule)
        page  (fetch-encoded-url (first (:site-url site))
                                 (:url-encoding site))
        nodes (html/select page (:selector rule))]
    (mapv trim nodes)))
;; One vector per scraped row: parse every configured field of :site1 into
;; its own column, then transpose the columns into rows.
(def element-lists
  (let [fields  (keys (get-in config [:site1 :parsing-index]))
        columns (map parse-element fields)]
    (apply map vector columns)))
;; Vector of maps, one per row of element-lists, labelling each value with
;; its field name.
;; The labels are taken from the same (keys parsing-index) call that
;; element-lists uses to order its columns, so a value can never be paired
;; with the wrong field if the configuration is reordered or extended.
(def tagged-lists
  (let [fields (-> config :site1 :parsing-index keys)]
    (mapv #(zipmap fields %) element-lists)))
;==== Fn call ====
; Print the value of tagged-lists to standard output.
(println tagged-lists)
Upvotes: 0
Views: 245
Reputation: 3179
Pass :site1
as an argument to parse-element
and element-lists
.
(defn parse-element
  "Fetches the first :site-url configured for `site`, selects the nodes
  matching the :selector of `element`, and returns a vector with that
  element's :trimming-fn applied to every selected node."
  [site element]
  (let [site-conf (site config)
        rule      (-> site-conf :parsing-index element)
        trim      (:trimming-fn rule)
        page      (fetch-encoded-url (first (:site-url site-conf))
                                     (:url-encoding site-conf))
        nodes     (html/select page (:selector rule))]
    (mapv trim nodes)))
(defn element-lists
  "Returns one vector per scraped row for `site`: every field in the
  site's :parsing-index is parsed into a column with parse-element, and
  the columns are transposed into rows. Returns an empty list when the
  site has no configured fields."
  [site]
  ;; This must be defn, not def: def does not accept a parameter vector.
  (let [fields  (-> config site :parsing-index keys)
        columns (map (partial parse-element site) fields)]
    ;; Guard the empty case: (apply map vector []) would call (map vector)
    ;; with no collections instead of yielding an empty result.
    (if (seq columns)
      (apply map vector columns)
      ())))
And then map over :site1
:site2
… keys.
Addendum in answer to the further question in the comments.
You could wrap the html/select
in a map
over the :site-url
s. Something like:
(defn parse-element
  "Fetches every :site-url configured for `site`, selects the nodes
  matching the :selector of `element` on each page, and returns a single
  vector with that element's :trimming-fn applied to every selected node.

  The per-page results are concatenated (mapcat) before trimming, so the
  :trimming-fn still receives one node at a time, exactly as it does when
  only a single page is parsed."
  [site element]
  (let [site-urls (-> config site :site-url)
        encoding  (-> config site :url-encoding)
        selector  (-> config site :parsing-index element :selector)
        trim      (-> config site :parsing-index element :trimming-fn)]
    (into []
          (map trim
               (mapcat #(html/select (fetch-encoded-url % encoding) selector)
                       site-urls)))))
(I hope I got the parens right.)
Note that mapping over the URLs produces one list of nodes per URL, so either flatten the results before applying the :trimming-fn or adapt the :trimming-fn to handle the nesting. An apply
should suffice.
Upvotes: 1