Mr Anonymous
Mr Anonymous

Reputation: 41

Invalid URL '': No schema supplied. Perhaps you meant http://?

I want to code a Python 3 script that splits URLs and checks each response for a string using the requests module.

But some of the URLs are redirected by a meta refresh tag, and I want Python to follow those links. Another problem is that when I use a for loop to send requests to the URLs, I receive

<Invalid URL '': No schema supplied. Perhaps you meant http://?> ERROR

If you look at the code below you'll understand better what I mean.

        fopen2 = open("clean url.txt", "r")
    splurl = fopen2.read().split('\n')
    urlcln = []
    urlcln2 = []
    print(splurl)
    for i in splurl:
        getthis = requests.get(i)
        parserres = BeautifulSoup(getthis.text, 'html.parser')
        print(parserres)
        if "<title>" in str(parserres):
            print('yes')
        else:
            print('no')

This is the full source code...

import time
import sys
import re
from tqdm.auto import tqdm
import requests
from bs4 import BeautifulSoup
# from colorama import Fore, Back, Style

try:

    def finish(elapsed=None):
        """Print the closing banner and the elapsed processing time.

        elapsed: seconds to report. When None (the default, so existing
        ``finish()`` call sites keep working) it falls back to the
        module-level ``toc - tic`` globals set by the cleaning branches.
        """
        print("""

        ==============================

        Duplicated URLs Remover With it Fathers!

        Developed Version ~ 1.0.3 By Ehsan Abafat
         _____ _                          _    _            __       _
        | ____| |__  ___  __ _ _ __      / \  | |__   __ _ / _| __ _| |_
        |  _| | '_ \/ __|/ _` | '_ \    / _ \ | '_ \ / _` | |_ / _` | __|
        | |___| | | \__ \ (_| | | | |  / ___ \| |_) | (_| |  _| (_| | |_
        |_____|_| |_|___/\__,_|_| |_| /_/   \_\_.__/ \__,_|_|  \__,_|\__|

        =============================

        clean URLs successfully!

        time that spent for this process:

        """)
        if elapsed is None:
            # Backward-compatible path: read the timing globals the caller set.
            elapsed = toc - tic
        print(elapsed, "S")

    if sys.argv[1] == '-h':
        print(100*"*")
        print('''

            => ~ Usage : Put dirty URLs in "old url.txt" file and call python3 source.py -?

                    -s : Fully Clean URLs
                    -d : Clean URLs with keeping "/"
                    -f : Clean Duplicated URLs Example : 1 2 3 2 => 1 2 3

           => ~ Developed Version ~ 1.0.3 By Ehsan Abafat

        ''')
        print(100*"*")
    elif sys.argv[1] == '-s':

        tic = time.time()
        f = open("old url.txt", "r")
        flisted = f.read().lower().replace('https', 'http').replace(
            'http://', '').replace('www.', '')
        SmartRemover = re.sub("/(\w+)?", "", flisted).split('\n')
        listurl = []
        dupurl = ["\n"]
        fullclean = []
        print('\n Getting Lines... \n')
        for i in tqdm(SmartRemover):
            if i in listurl:
                dupurl.append(i.strip())
            else:
                listurl.append(i.strip())
        print('\n Cleaning... \n')
        for i in tqdm(listurl):
            if i not in dupurl:
                fullclean.append(i)
        f.close()

        flast = open("clean url.txt", "w")
        for i in fullclean:
            if(i != '\n' and i != '\s' and i != '' and len(i) > 2):
                flast.write('http://'+str(i)+'\n')
        toc = time.time()
        finish()
        flast.close()
    elif sys.argv[1] == '-d':

        tic = time.time()
        f = open("old url.txt", "r")
        flisted = f.read().lower().replace('https', 'http').replace(
            'http://', '').replace('www.', '').split()
        listurl = []
        dupurl = ["\n"]
        fullclean = []
        print('\n Getting Lines... \n')
        for i in tqdm(flisted):
            if i in listurl:
                dupurl.append(i.strip())
            else:
                listurl.append(i.strip())
        print('\n Cleaning... \n')
        for i in tqdm(listurl):
            if i not in dupurl:
                fullclean.append(i)
        f.close()

        flast = open("clean url.txt", "w")
        for i in fullclean:
            if(i != '\n' and i != '\s' and i != '' and len(i) > 2):
                flast.write('http://'+str(i)+'\n')
        toc = time.time()
        finish()
        flast.close()
    elif sys.argv[1] == '-f':

        tic = time.time()
        f = open("old url.txt", "r")
        flisted = f.read().lower().replace('https', 'http').replace(
            'http://', '').replace('www.', '').split()
        listurl = []
        dupurl = ["\n"]
        fullclean = []
        print('\n Getting Lines... \n')
        for i in tqdm(flisted):
            if i in listurl:
                dupurl.append(i.strip())
            else:
                listurl.append(i.strip())
        print('\n Cleaning... \n')
        flast = open("clean url.txt", "w")
        for i in tqdm(listurl):
            print(i)
            if(i != '\n' and i != '\s' and i != '' and len(i) > 2):
                flast.write('http://'+str(i)+'\n')
        f.close()

        toc = time.time()
        finish()
        flast.close()
    else:
        print('unknown command! use python3 source.py -h')

    if len(sys.argv) == 2:
        pass
    elif len(sys.argv) == 3:
        telerikuiVul = '{ "message" : "RadAsyncUpload handler is registered succesfully, however, it may not be accessed directly." }'
        telerikBugCheckADR = "/Telerik.Web.UI.WebResource.axd?type=rau"
        fopen2 = open("clean url.txt", "r")
        splurl = fopen2.read().split('\n')
        urlcln = []
        urlcln2 = []
        print(splurl)
        for i in splurl:
            getthis = requests.get(i)
            parserres = BeautifulSoup(getthis.text, 'html.parser')
            print(parserres)
            if "<title>" in str(parserres):
                print('yes')
            else:
                print('no')
    else:
        if(sys.argv[1] != '-h'):
            print("use '<Python3 source.py -h>' command")
        else:
            print('You are see Usage of This Script!')
except Exception as e:
    print(e)

Upvotes: 2

Views: 14476

Answers (1)

Mladen Milosavljevic
Mladen Milosavljevic

Reputation: 1810

You don't have "http://" in your URL list. www.address.com should be http://www.address.com.

This should fix it

for i in splurl:
    # Guard against the trailing '' that str.split('\n') produces — it is
    # what triggers "Invalid URL '': No schema supplied".
    if not i.strip():
        continue
    # Only prepend a scheme when the entry does not already carry one,
    # otherwise entries like "http://example.com" would become
    # "http://http://example.com".
    url = i if i.startswith("http") else "http://" + i
    getthis = requests.get(url)
    parserres = BeautifulSoup(getthis.text, 'html.parser')
    print(parserres)
    if "<title>" in str(parserres):
        print('yes')
    else:
        print('no')

Upvotes: 3

Related Questions