Reputation: 1141
I am building a NodeJS application that can store URLs in a database. I want to use the URLs as a primary key, to avoid duplicates being stored. In order to do this I need the url to be in the simplest form possible, removing extra slashes, arguments and prefixes.
How do I convert all of the URLs listed below into the same string as the first URL listed? Is there a way to do this safely, so that it also accounts for other variations I might not have listed below?
https://website.com/coolpage/938921/
https://www.website.com/coolpage/938921/
http://website.com/coolpage/938921/
https://website.com/coolpage/938921/
https://website.com/coolpage/938921/?awesome=1
https://website.com/coolpage/938921?awesome=1
https:///website.com//coolpage//938921//
Upvotes: 0
Views: 129
Reputation: 7475
Use the standard Node.js `url` module.
Solution:
require('url');
/**
 * Reduce a URL to a canonical "host/path" string so that equivalent URLs
 * map to the same key.
 *
 * The scheme, query string and fragment are dropped, a leading "www." host
 * label is removed, runs of slashes in the path are collapsed to one, and
 * the trailing slash is removed.
 *
 * @param {string} url - Absolute URL to normalize.
 * @returns {string} Host (including any explicit port) followed by the cleaned path.
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor.
 */
function getBaseUrl(url) {
  const u = new URL(url);

  // Anchor the match so only a leading "www." label is stripped; an
  // unanchored replace would also hit hosts like "awww.com" or path
  // segments such as "/www.html".
  const host = u.host.replace(/^www\./, '');

  // Collapse every run of two or more slashes (a plain split('//') leaves
  // "///" as "//"), then drop the single trailing slash that may remain.
  const path = u.pathname.replace(/\/{2,}/g, '/').replace(/\/$/, '');

  return `${host}${path}`;
}
Test:
// Sample inputs: every entry is a variant of the same page and should
// normalize to the same string.
const urls = [
  "https://website.com/coolpage/938921/",
  "https://www.website.com/coolpage/938921/",
  "http://website.com/coolpage/938921/",
  "https://website.com/coolpage/938921/",
  "https://website.com/coolpage/938921/?awesome=1",
  "https://website.com/coolpage/938921?awesome=1",
  "https:///website.com//coolpage//938921//"
];

// Print each normalized URL prefixed with its position in the list.
urls.forEach((rawUrl, index) => {
  const normalized = getBaseUrl(rawUrl);
  console.log(`${index}: ${normalized}`);
});
Console output:
0: website.com/coolpage/938921
1: website.com/coolpage/938921
2: website.com/coolpage/938921
3: website.com/coolpage/938921
4: website.com/coolpage/938921
5: website.com/coolpage/938921
6: website.com/coolpage/938921
Upvotes: 2
Reputation: 1
You can use `String.prototype.replace()` with the RegExp `/\/+/g` to replace each run of one or more forward slash characters with a single `/`, and then `String.prototype.match()` with the RegExp `/[a-z0-9]+\.[a-z0-9]+(?=\/+)\/[a-z0-9]+(?=\/+)\/[a-z0-9]+/ig` to match the hostname and pathname of the URL.
// Sample inputs: variants of the same page that should all reduce to _URL.
const urls = [
  "https://website.com/coolpage/938921/",
  "https://www.website.com/coolpage/938921/",
  "http://website.com/coolpage/938921/",
  "https://website.com/coolpage/938921/",
  "https://website.com/coolpage/938921/?awesome=1",
  "https://website.com/coolpage/938921?awesome=1",
  "https:///website.com//coolpage//938921//"
];
// Expected canonical form for every entry in `urls`.
const _URL = "website.com/coolpage/938921";
// Matches any run of one or more forward slashes.
const replaceForwardSlashes = /\/+/g;
// Matches "<label>.<label>/<segment>/<segment>" (host plus two path segments).
const matchHostAndPathNames = /[a-z0-9]+\.[a-z0-9]+(?=\/+)\/[a-z0-9]+(?=\/+)\/[a-z0-9]+/ig;

// Collapse slash runs, then keep the first host/path match as a plain string
// (null when nothing matches) so the results can be compared with === and
// de-duplicated by value in a Set.
const matchedURLS = urls.map((url) => {
  const found = url.replace(replaceForwardSlashes, '/').match(matchHostAndPathNames);
  return found === null ? null : found[0];
});

// Pass the array itself to Set: spreading it would hand each element to the
// constructor as a separate argument, and only the first one would be used.
console.log(
  matchedURLS,
  new Set(matchedURLS).size === 1,
  matchedURLS.every((u) => u === _URL)
);
Upvotes: 0
Reputation: 1937
Here you have the function to achieve what you want:
/**
 * Reduce a URL string to a canonical "host/path" form using plain string
 * operations.
 *
 * The scheme, query string and fragment are dropped, empty path segments
 * (from repeated or trailing slashes) are skipped, and a leading "www." is
 * removed from the host.
 *
 * @param {string} url - URL to normalize; the scheme is optional.
 * @returns {string} Normalized "host/segment/..." string, or '' when no host
 *   segment is found.
 */
function convertURL(url) {
  // Cut the query string / fragment first so a final segment such as
  // "938921?awesome=1" keeps its path part instead of being discarded.
  const withoutQuery = url.split(/[?#]/)[0];

  // Empty segments come from "//", "///" and trailing slashes.
  const segments = withoutQuery.split('/').filter((segment) => segment.length > 0);

  // The host is the first segment that is not a scheme such as "https:";
  // this avoids hard-coding a particular top-level domain.
  const hostIndex = segments.findIndex((segment) => !segment.endsWith(':'));
  if (hostIndex === -1) {
    return '';
  }

  const host = segments[hostIndex].replace(/^www\./i, '');
  return [host, ...segments.slice(hostIndex + 1)].join('/');
}
// Demo: normalize one sample URL and print the result.
const sampleInput = 'https://website.com/coolpage/938921/?awesome=1';
const url = convertURL(sampleInput);
console.log(url);
Upvotes: 0