6:[["$","$Le",null,{}],["$","div",null,{"className":"min-h-screen bg-gray-100 p-6","children":[["$","$Lf",null,{}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"QAPage\",\"mainEntity\":{\"@type\":\"Question\",\"name\":\"Nutch Raw Html Saving\",\"text\":\"

I'm trying to get raw html of crawled pages in different files, named as url of the page. Is it possible with Nutch to save the raw html pages in different files by ruling out the indexing part?

\\n\",\"author\":{\"@type\":\"Person\",\"name\":\"İsmet Alkan\"},\"upvoteCount\":2,\"answerCount\":1,\"acceptedAnswer\":{\"@type\":\"Answer\",\"text\":\"

The is no direct way to do that. You will have to do few code modifications.\\nSee this and this.

\\n\",\"author\":{\"@type\":\"Person\",\"name\":\"Tejas Patil\"},\"upvoteCount\":2}}}"}}],["$","div",null,{"className":"bg-white shadow-md rounded-lg p-6 mb-6 relative","children":[["$","div",null,{"className":"absolute top-4 right-4 flex flex-wrap space-x-2","children":[["$","span","nutch",{"className":"bg-blue-600 text-white text-sm px-3 py-1 rounded-full","children":["$","$L10",null,{"href":"/discussion/tag/nutch/1","children":"nutch"}]}]]}],["$","div",null,{"className":"flex items-center mb-4","children":[["$","img",null,{"src":"https://i.sstatic.net/xl94H.jpg?s=256","alt":"İsmet Alkan","className":"w-16 h-16 rounded-full border"}],["$","div",null,{"className":"ml-4","children":[["$","a",null,{"href":"https://stackoverflow.com/users/1275577/%c4%b0smet-alkan","target":"_blank","rel":"noopener noreferrer","className":"text-lg font-semibold text-blue-600 hover:underline","children":"İsmet Alkan"}],["$","p",null,{"className":"text-sm text-gray-500","children":["Reputation: ",5447]}]]}]]}],["$","h1",null,{"className":"text-2xl font-bold text-gray-800 mb-4","children":"Nutch Raw Html Saving"}],["$","p",null,{"className":"text-gray-700 mt-4","dangerouslySetInnerHTML":{"__html":"

I'm trying to get raw html of crawled pages in different files, named as url of the page. Is it possible with Nutch to save the raw html pages in different files by ruling out the indexing part?

\n"}}],["$","div",null,{"className":"text-gray-600 text-sm mt-4","children":[["$","p",null,{"children":["Upvotes: ",2]}],["$","p",null,{"children":["Views: ",2596]}]]}]]}],["$","div",null,{"className":"container mx-auto","children":[["$","h2",null,{"className":"text-2xl font-semibold text-gray-800 mb-6","children":["Answers (",1,")"]}],[["$","div","10150402",{"className":"bg-white shadow-md rounded-lg p-6 mb-6","children":[["$","div",null,{"className":"flex items-center mb-4","children":[["$","img",null,{"src":"https://i.sstatic.net/jiYwk.jpg?s=256","alt":"Tejas Patil","className":"w-12 h-12 rounded-full border"}],["$","div",null,{"className":"ml-4","children":[["$","a",null,{"href":"https://stackoverflow.com/users/1150329/tejas-patil","target":"_blank","rel":"noopener noreferrer","className":"text-lg font-semibold text-blue-600 hover:underline","children":"Tejas Patil"}],["$","p",null,{"className":"text-sm text-gray-500","children":["Reputation: ",6169]}]]}]]}],["$","p",null,{"className":"text-gray-700 mb-4","dangerouslySetInnerHTML":{"__html":"

The is no direct way to do that. You will have to do few code modifications.\nSee this and this.

\n"}}],["$","div",null,{"className":"text-gray-600 text-sm","children":["$","p",null,{"children":["Upvotes: ",2]}]}]]}]]]}],["$","div",null,{"className":"bg-white shadow-md rounded-lg p-6 mt-6","children":[["$","h2",null,{"className":"text-2xl font-semibold text-gray-800 mb-4","children":"Related Questions"}],["$","ul",null,{"className":"list-disc list-inside","children":[["$","li","5824899",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/5824899","className":"text-blue-600 hover:underline","children":"How can I save a "complete" HTML file as single file?"}]}],["$","li","53458476",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/53458476","className":"text-blue-600 hover:underline","children":"Nutch - parse custom HTML elements"}]}],["$","li","44943907",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/44943907","className":"text-blue-600 hover:underline","children":"Apache Nutch fetch and updatedb stages"}]}],["$","li","30655322",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/30655322","className":"text-blue-600 hover:underline","children":"Save unescaped HTML to string"}]}],["$","li","17194003",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/17194003","className":"text-blue-600 hover:underline","children":"How to fix saveHTML(): output conversion failed due to conv error?"}]}],["$","li","17972582",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/17972582","className":"text-blue-600 hover:underline","children":"How to parse content located in specific HTML tags using nutch plugin?"}]}],["$","li","18524831",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/18524831","className":"text-blue-600 hover:underline","children":"How to configure apache nutch to remove all a tags and it's content?"}]}],["$","li","11312294",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/11312294","className":"text-blue-600 hover:underline","children":"How to save an invalid XHTML website as valid XML"}]}],["$","li","800059",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/800059","className":"text-blue-600 hover:underline","children":"Parsing html data with nutch 1.0 and a custom plugin"}]}],["$","li","1068261",{"className":"mb-2","children":["$","$L10",null,{"href":"/discussion/solution/1068261","className":"text-blue-600 hover:underline","children":"Tidy gives non-standard HTML"}]}]]}]]}]]}],["$","$L11",null,{}],["$","$L12",null,{}],["$","$L13",null,{}],["$","$L14",null,{}],["$","$L15",null,{}]]

Nutch Raw Html Saving

Answers (1)

Related Questions