Reputation: 1390
I have a lot of images on my Apache server that I want to upload to Azure. I cannot afford to do it sequentially, so I will add threading afterwards. I can access those images from a given URL and build a list from it. Easy. But I do not have enough disk space to download each image, upload it, and then delete it. I would like something cleaner.
Is there a method to do that?
Something like :
block_blob_service.AZURECOMMAND(container, source_URL, target_blob_name)
If not possible, is there a workaround?
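(One disk-free workaround I could live with would be to pass the image bytes through memory instead of the filesystem. A minimal sketch, assuming the same block_blob_service object and names as in the code below; create_blob_from_bytes is the old BlockBlobService call:)
import requests
# Fetch the image into memory and hand the bytes to the SDK -- no local file
resp = requests.get(image_url)
resp.raise_for_status()
block_blob_service.create_blob_from_bytes(container, target_blob_name, resp.content)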
Here is the complete code I have today (download and then upload, which is what I want to avoid). EDIT: Thanks to Gaurav Mantri, I got it working. I have updated the code.
import requests
from bs4 import BeautifulSoup
from os.path import basename
import os
import sys
import urllib
import urllib2
import urlparse
import argparse
import json
import config
import random
import base64
import datetime
import time
import string
from azure.storage import CloudStorageAccount, AccessPolicy
from azure.storage.blob import BlockBlobService, PageBlobService, AppendBlobService
from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
from azure.storage.blob.models import BlobBlock, ContainerPermissions, ContentSettings

CURRENT_DIR = os.getcwd()
STORING_DIRECTORY_NAME = "storage_scraped_images"
STORING_DIRECTORY = CURRENT_DIR + "/" + STORING_DIRECTORY_NAME
if not os.path.exists(STORING_DIRECTORY):
    os.makedirs(STORING_DIRECTORY)

def randomword(length):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))

startdate = time.clock()
metadata_loaded = {'Owner': 'ToBeAddedSoon', 'Date_Of_Upload': startdate, 'VAR_2': 'VAL_VAR_2', 'VAR_3': 'VAL_VAR_3', 'VAR_4': 'VAL_VAR_4'}

with open("credentials.json", 'r') as f:
    data = json.loads(f.read())

StoAcc_var_name = data["storagacc"]["Accountname"]
StoAcc_var_key = data["storagacc"]["AccountKey"]
StoAcc_var_container = data["storagacc"]["Container"]
#print StoAcc_var_name, StoAcc_var_key, StoAcc_var_container

def copy_azure_files(source_url, destination_object, destination_container):
    # Server-side copy: Azure pulls the bytes from source_url directly,
    # nothing passes through this machine
    blob_service = BlockBlobService(account_name=StoAcc_var_name, account_key=StoAcc_var_key)
    blob_service.copy_blob(destination_container, destination_object, source_url)

block_blob_service = BlockBlobService(account_name=StoAcc_var_name, account_key=StoAcc_var_key)

def upload_func(container, blobname, filename):
    start = time.clock()
    block_blob_service.create_blob_from_path(
        container,
        blobname,
        filename)
    elapsed = time.clock() - start
    print "*** DEBUG *** Time spent uploading API ", filename, " is : ", elapsed, " in Bucket/container : ", container

#URL_TARGET = "https://mouradcloud.westeurope.cloudapp.azure.com/blog/blog/category/food/"
URL_TARGET = "https://www.cdiscount.com/search/10/telephone.html"
base_url = URL_TARGET
out_folder = '/tmp'
r = requests.get(URL_TARGET)
data = r.text
soup = BeautifulSoup(data, "lxml")

for link in soup.find_all('img'):
    image_url = link.get("src")
    if image_url is None:
        continue
    if 'http' in image_url:
        if image_url.endswith(('.png', '.jpg', '.jpeg')):
            print " ->>>>>>>>>>>>>> THIS IS AN IMAGE ... PROCESSING "
            file_name_downloaded = basename(image_url)
            filename_in_clouddir = "uploads" + "/" + file_name_downloaded
            # The local download is no longer needed, copy_blob copies server side:
            #file_name_path_local = STORING_DIRECTORY + "/" + file_name_downloaded
            #with open(file_name_path_local, "wb") as f:
            #    f.write(requests.get(image_url).content)
            #upload_func(StoAcc_var_container, filename_in_clouddir, file_name_path_local)
            copy_azure_files(image_url, filename_in_clouddir, StoAcc_var_container)
        else:
            print " ->>>>>>>>>>>>>> THIS IS NOT AN IMAGE ... SKIPPING "
    else:
        print " ->>>>>>>>>>>>>> THIS IS A LOCAL IMAGE ... SKIPPING "
Upvotes: 1
Views: 1758
Reputation: 136306
Indeed, there's something exactly like this: copy_blob
block_blob_service.copy_blob(container, target_blob_name, source_URL)
Please keep in mind that this copy operation is an asynchronous server-side copy, thus: the source URL must be publicly accessible (or include a SAS token), and copy_blob returns as soon as the copy has been scheduled, not when it has completed, so check the destination blob's copy status if you need to know when it is done.
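A minimal sketch of waiting for such a copy to finish, assuming the block_blob_service object from the code below; copy_blob returns a CopyProperties whose status can be re-read through get_blob_properties:
copy_props = block_blob_service.copy_blob(container, target_blob_name, source_URL)
while copy_props.status == 'pending':
    time.sleep(1)
    # Re-read the destination blob's properties to get the current copy status
    copy_props = block_blob_service.get_blob_properties(container, target_blob_name).properties.copy
print "Copy finished with status: ", copy_props.status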
UPDATE
Modified code (I have not tried running it)
import requests
from bs4 import BeautifulSoup
from os.path import basename
import os
import sys
import urllib
import urllib2
import urlparse
import argparse
import json
import config
import random
import base64
import datetime
import time
import string
from azure.storage import CloudStorageAccount, AccessPolicy
from azure.storage.blob import BlockBlobService, PageBlobService, AppendBlobService
from azure.storage.models import CorsRule, Logging, Metrics, RetentionPolicy, ResourceTypes, AccountPermissions
from azure.storage.blob.models import BlobBlock, ContainerPermissions, ContentSettings

CURRENT_DIR = os.getcwd()
STORING_DIRECTORY_NAME = "storage_scraped_images"
STORING_DIRECTORY = CURRENT_DIR + "/" + STORING_DIRECTORY_NAME
if not os.path.exists(STORING_DIRECTORY):
    os.makedirs(STORING_DIRECTORY)

def randomword(length):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))

startdate = time.clock()
metadata_loaded = {'Owner': 'ToBeAddedSoon', 'Date_Of_Upload': startdate, 'VAR_2': 'VAL_VAR_2', 'VAR_3': 'VAL_VAR_3', 'VAR_4': 'VAL_VAR_4'}

with open("credentials.json", 'r') as f:
    data = json.loads(f.read())

StoAcc_var_name = data["storagacc"]["Accountname"]
StoAcc_var_key = data["storagacc"]["AccountKey"]
StoAcc_var_container = data["storagacc"]["Container"]
#print StoAcc_var_name, StoAcc_var_key, StoAcc_var_container

block_blob_service = BlockBlobService(account_name=StoAcc_var_name, account_key=StoAcc_var_key)

def upload_func(container, blobname, sourceurl):
    start = time.clock()
    # copy_blob schedules an asynchronous server-side copy, so this timing
    # measures only the copy request, not the copy itself
    block_blob_service.copy_blob(
        container,
        blobname,
        sourceurl)
    elapsed = time.clock() - start
    print "*** DEBUG *** Time spent uploading API ", blobname, " is : ", elapsed, " in Bucket/container : ", container

#URL_TARGET = "https://mouradcloud.westeurope.cloudapp.azure.com/blog/blog/category/food/"
URL_TARGET = "https://www.cdiscount.com/search/10/telephone.html"
base_url = URL_TARGET
out_folder = '/tmp'
r = requests.get(URL_TARGET)
data = r.text
soup = BeautifulSoup(data, "lxml")

for link in soup.find_all('img'):
    image_url = link.get("src")
    if image_url is None:
        continue
    if 'http' in image_url:
        if image_url.endswith(('.png', '.jpg', '.jpeg')):
            print " ->>>>>>>>>>>>>> THIS IS AN IMAGE ... PROCESSING "
            file_name_downloaded = basename(image_url)
            filename_in_clouddir = "uploads" + "/" + file_name_downloaded
            upload_func(StoAcc_var_container, filename_in_clouddir, image_url)
        else:
            print " ->>>>>>>>>>>>>> THIS IS NOT AN IMAGE ... SKIPPING "
    else:
        print " ->>>>>>>>>>>>>> THIS IS A LOCAL IMAGE ... SKIPPING "
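Since each copy_blob call only schedules a server-side copy and returns quickly, adding the threading mentioned in the question is straightforward. A minimal sketch using a thread pool from the standard library; image_pairs is a hypothetical list of (blobname, image_url) tuples collected from the soup loop above:
from multiprocessing.dummy import Pool  # thread pool, despite the module name

def copy_one(pair):
    blobname, image_url = pair
    upload_func(StoAcc_var_container, blobname, image_url)

pool = Pool(10)  # issue 10 concurrent copy requests
pool.map(copy_one, image_pairs)
pool.close()
pool.join()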
Upvotes: 1