Reputation: 373
I need to copy a folder and its entire content (with subfolders) to another folder in Google Drive.
I tried using Colaboratory like this:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive
%cp -av FOLDER_TO_COPY NEW_FOLDER_COPY
Folders and files are copied except for Google files, which give me the following error:
cp: cannot open 'path_to_file' for reading: Operation not supported
This happens for every .gdoc, .gsheet, .gslides, etc.
I cannot convert those files to another format (like .docx or .xlsx) because some have complex formulas that I don't want to screw up.
I also tried this:
from shutil import copyfile
copyfile('path_to_file_to_copy',
'destination_path')
But got:
OSError: [Errno 95] Operation not supported: 'path_to_file'
How can I copy Google files using Colaboratory?
Upvotes: 3
Views: 2197
Reputation: 5063
Google Documents are not really regular files, but you can copy them from Google Colab by calling the Google Drive API.
Here is a complete Colab Notebook for doing this:
# -*- coding: utf-8 -*-
"""[Tool] Deeply copy shared folders in Google Drive using Google Drive API
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1sLyAqMpCA2bYW2-SQ_OJHPzeHen4nd6S
"""
#@title Notebook to deeply copy files in shared folders in Google Drive
from google.colab import drive
# Mount Google Drive (for access to the folder names & to help set paths)
# The '/gdrive' mount point matters: the path helpers in this notebook
# recognise My Drive paths by the '/gdrive/MyDrive/' prefix.
print('Mounting Google Drive...')
drive.mount('/gdrive')
# Install (or upgrade) the Google API client libraries used by the Drive API
# cell; the actual imports happen in that cell.
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
#@title Notebook to Define the source and target paths
# For the source, you can provide a URL, folder ID, or My Drive path.
# For example, if the folder is shared and its URL is:
# https://drive.google.com/drive/folders/abcdef1234567890
# you can set src_path to that URL or just the folder ID.
src = 'https://drive.google.com/drive/folders/foo' #@param {type: 'string'}
# For the target path, we assume it's in your My Drive.
target = '/gdrive/MyDrive/folderfoo-cloned' #@param {type: 'string'}
#@title Copy ordinary (not Gsheets, GDocs etc) files
# This cell does a plain filesystem copy through the mounted drive.
# Google Workspace files fail here with "Operation not supported" and are
# handled by the Drive API cell further down.
src_path = src
import os
# Make sure the target directory exists on the mounted drive before copying.
os.makedirs(target, exist_ok=True)
assert os.path.exists(target), f"Target '{target}' doesn't exist!"
# The copy lands in <target>/<last component of src_path>.
# NOTE(review): when src is a URL, basename() yields the folder ID from the
# URL, and the cp below presumably fails because a URL is not a filesystem
# path -- confirm whether a mounted path is expected for this step.
target_path = os.path.join(target, os.path.basename(src_path))
print(f'Copying files from "{src_path}" to "{target_path}"...')
os.makedirs(target_path, exist_ok=True)
!cp -rf "$src_path"/* "$target_path" # also works when source is a shortcut
#@title Now make copies of workspace files which above failed with "Operation not supported"
# Ask the Colab user to authenticate so the Drive API client below can be used.
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
# Drive API v3 client. It is a module-level global that every helper function
# in this cell reads directly.
drive_service = build('drive', 'v3')
import os, re, csv
# Global list to store archival records for files.
# Each record will be a dict with: path, createdTime, modifiedTime, fileId, mimeType
# copy_folder() appends to it and the main code writes it out as CSV.
archive_records = []
# --- Helper Functions ---
def extract_folder_id(url_or_id):
    """Return the Drive folder ID contained in a folder URL or a bare ID.

    Accepted inputs:
      * a Drive URL containing ``/folders/<folder_id>`` (any prefix such as
        ``/drive/u/0`` and any trailing query string are ignored);
      * a legacy Drive URL carrying the ID as an ``id=<folder_id>`` query
        parameter;
      * a string that already looks like an ID (10 or more word characters
        or hyphens).

    Surrounding whitespace is ignored so that values pasted into a form
    field with a stray space or newline still resolve.

    Args:
        url_or_id: The URL or ID string supplied by the user.

    Returns:
        The folder ID, without any surrounding whitespace.

    Raises:
        ValueError: If the input is a Drive URL with no recognisable folder
            ID, or is neither a Drive URL nor an ID-shaped string.
            (ValueError is an Exception subclass, so existing
            ``except Exception`` handlers keep working.)
    """
    candidate = url_or_id.strip()
    if "drive.google.com" in candidate:
        found = (
            re.search(r'/folders/([-\w]+)', candidate)
            or re.search(r'[?&]id=([-\w]+)', candidate)
        )
        if found:
            return found.group(1)
        raise ValueError("Could not extract folder id from URL: " + url_or_id)
    # fullmatch (rather than match with '$') rejects a trailing newline
    # instead of silently returning it as part of the ID.
    if re.fullmatch(r'[\w-]{10,}', candidate):
        return candidate
    raise ValueError("Input is neither a valid URL nor a folder ID: " + url_or_id)
def is_url_or_id(path):
    """Tell whether *path* looks like a Drive URL or a bare Drive ID.

    A value counts as a URL when it contains ``drive.google.com`` and as an
    ID when, after trimming surrounding whitespace, it consists solely of
    10 or more word characters or hyphens. Anything else (for example a
    filesystem path containing slashes) is reported as neither.

    Args:
        path: The user-supplied source string.

    Returns:
        True for a URL or ID-shaped string, False otherwise.
    """
    candidate = path.strip()
    # fullmatch avoids the '$' quirk of accepting a trailing newline.
    return (
        "drive.google.com" in candidate
        or re.fullmatch(r'[\w-]{10,}', candidate) is not None
    )
def get_folder_id_from_path(path):
    """Resolve a My Drive path to the ID of the folder it names.

    The path is walked one component at a time starting from the Drive
    ``root`` alias, issuing one ``files().list`` query per component via the
    module-level ``drive_service`` client.

    Args:
        path: A path such as ``/gdrive/MyDrive/Folder/Subfolder``. The
            ``/gdrive/MyDrive`` mount prefix is removed when present; any
            other path is interpreted relative to the Drive root. Empty
            components (leading, trailing or doubled slashes) are ignored.

    Returns:
        The ID of the final folder, or ``'root'`` when the path has no
        components (for example exactly ``/gdrive/MyDrive``).

    Raises:
        Exception: If a component does not exist as a non-trashed folder
            under its parent.
    """
    mount_root = '/gdrive/MyDrive'
    if path == mount_root or path.startswith(mount_root + '/'):
        relative_path = path[len(mount_root):]
    else:
        relative_path = path
    parent_id = 'root'
    for part in relative_path.split('/'):
        if not part:
            # Skip empty segments instead of querying for name=''.
            continue
        # Drive query string literals need backslashes and single quotes
        # escaped, otherwise a name like "Bob's files" breaks the query.
        escaped_part = part.replace("\\", "\\\\").replace("'", "\\'")
        query = (
            "mimeType='application/vnd.google-apps.folder' and "
            f"name='{escaped_part}' and '{parent_id}' in parents and trashed=false"
        )
        result = drive_service.files().list(q=query, spaces='drive',
                                            fields='files(id, name)').execute()
        files = result.get('files', [])
        if not files:
            raise Exception(f"Folder not found: {part} under parent ID {parent_id}")
        # If several folders share the name, the first one returned is used.
        parent_id = files[0]['id']
    return parent_id
def get_or_create_folder(folder_name, parent_id):
    """Return the ID of a named folder under a parent, creating it if absent.

    Uses the module-level ``drive_service`` client: first a ``files().list``
    query for a non-trashed folder with this name under ``parent_id``, then,
    if nothing matched, a ``files().create`` call.

    Args:
        folder_name: Name of the folder to find or create. It may contain
            single quotes or backslashes; both are escaped for the query.
        parent_id: ID (or the ``'root'`` alias) of the parent folder.

    Returns:
        The ID of the existing folder (the first match if several folders
        share the name) or of the newly created one.
    """
    # Drive query string literals need backslashes and single quotes escaped;
    # without this a name such as "Bob's files" makes the query invalid.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        "mimeType='application/vnd.google-apps.folder' and "
        f"name='{escaped_name}' and '{parent_id}' in parents and trashed=false"
    )
    result = drive_service.files().list(q=query, spaces='drive',
                                        fields='files(id, name)').execute()
    files = result.get('files', [])
    if files:
        return files[0]['id']
    # The create body takes the raw (unescaped) name.
    file_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_id],
    }
    folder = drive_service.files().create(body=file_metadata, fields='id').execute()
    return folder['id']
def get_or_create_folder_by_path(path):
    """Ensure a My Drive folder path exists and return the last folder's ID.

    Each path component is looked up (and created when missing) under the
    previous one with ``get_or_create_folder``, starting from the Drive
    ``root`` alias.

    Args:
        path: A path such as ``/gdrive/MyDrive/TargetFolder/Subfolder``. The
            ``/gdrive/MyDrive`` mount prefix is removed when present; any
            other path is interpreted relative to the Drive root. Empty
            components (leading, trailing or doubled slashes) are ignored so
            that no folder with an empty name is ever created.

    Returns:
        The ID of the final folder, or ``'root'`` when the path has no
        components (for example exactly ``/gdrive/MyDrive``).
    """
    mount_root = '/gdrive/MyDrive'
    if path == mount_root or path.startswith(mount_root + '/'):
        relative_path = path[len(mount_root):]
    else:
        relative_path = path
    parent_id = 'root'
    for part in relative_path.split('/'):
        if part:
            parent_id = get_or_create_folder(part, parent_id)
    return parent_id
def copy_folder(src_folder_id, dest_parent_id, new_folder_name=None, current_path=""):
    """Recursively copy a Drive folder into a destination parent folder.

    The destination folder is found or created with ``get_or_create_folder``.
    Sub-folders are handled by recursion. Every other item is duplicated with
    the Drive API ``files().copy`` call, unless an item with the same name
    already exists in the destination folder, in which case it is skipped.
    One record per non-folder item (copied or skipped) is appended to the
    module-level ``archive_records`` list.

    Args:
        src_folder_id: ID of the folder to copy.
        dest_parent_id: ID of the folder that will contain the copy.
        new_folder_name: Name for the destination folder. When None, the
            source folder's name is fetched from Drive and used.
        current_path: Relative path prefix used in the archival records and
            progress messages. Defaults to the destination folder name.

    Returns:
        The ID of the destination folder.
    """
    # Determine the new folder's name.
    if new_folder_name is None:
        folder_meta = drive_service.files().get(fileId=src_folder_id, fields="name").execute()
        new_folder_name = folder_meta['name']
    # Set current_path if not provided.
    if not current_path:
        current_path = new_folder_name
    # Create (or reuse) the destination folder.
    dest_folder_id = get_or_create_folder(new_folder_name, dest_parent_id)

    # List the source folder page by page, requesting the timestamps needed
    # for the archival records along with id, name and mimeType.
    query = f"'{src_folder_id}' in parents and trashed=false"
    page_token = None
    while True:
        response = drive_service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType, createdTime, modifiedTime)',
            pageToken=page_token,
        ).execute()
        for item in response.get('files', []):
            file_id = item['id']
            file_name = item['name']
            mime_type = item['mimeType']
            # Build the full relative path for archival.
            full_path = f"{current_path}/{file_name}"
            if mime_type == 'application/vnd.google-apps.folder':
                print(f"Copying folder: {full_path}")
                # Recursively copy subfolder, updating the path.
                copy_folder(file_id, dest_folder_id,
                            new_folder_name=file_name, current_path=full_path)
                continue
            # Escape backslashes first, then single quotes: both are special
            # inside a Drive query string literal.
            escaped_file_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
            query_file = (
                f"'{dest_folder_id}' in parents and "
                f"name='{escaped_file_name}' and trashed=false"
            )
            existing_files = drive_service.files().list(
                q=query_file, spaces='drive', fields='files(id)'
            ).execute().get('files', [])
            if existing_files:
                print(f"Skipping file (already exists): {full_path}")
            else:
                print(f"Copying file: {full_path}")
                drive_service.files().copy(
                    fileId=file_id,
                    body={'name': file_name, 'parents': [dest_folder_id]},
                ).execute()
            # Record the archival metadata for this file.
            archive_records.append({
                "path": full_path,
                "createdTime": item.get('createdTime', ''),
                "modifiedTime": item.get('modifiedTime', ''),
                "fileId": file_id,
                "mimeType": mime_type,
            })
        page_token = response.get('nextPageToken')
        if page_token is None:
            break
    return dest_folder_id
# --- Main Code ---
# Get the source folder ID:
# URLs and bare IDs are parsed directly; anything else is treated as a
# My Drive path and resolved folder by folder.
src_folder_id = (
    extract_folder_id(src_path)
    if is_url_or_id(src_path)
    else get_folder_id_from_path(src_path)
)
print("Source folder ID:", src_folder_id)

# Make sure the folder that will receive the clone exists, creating any
# missing levels, and remember its ID.
target_parent_id = get_or_create_folder_by_path(target_path)
print("Target parent folder ID:", target_parent_id)

# Name of the cloned folder: for a URL/ID source it is the folder's name as
# reported by Drive, otherwise the last component of the source path.
if is_url_or_id(src_path):
    folder_meta = drive_service.files().get(fileId=src_folder_id, fields="name").execute()
    new_folder_name = folder_meta['name']
else:
    new_folder_name = os.path.basename(src_path.rstrip('/'))
print("Cloning folder:", new_folder_name)

# Walk the source tree, copying items and collecting their metadata.
dest_folder_id = copy_folder(src_folder_id, target_parent_id, new_folder_name)
print("Copied folder ID:", dest_folder_id)

# Dump the collected metadata records to a CSV file in the working directory.
csv_filename = "archival_metadata.csv"
fieldnames = ["path", "createdTime", "modifiedTime", "fileId", "mimeType"]
with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(archive_records)
print(f"Archival metadata written to: {csv_filename}")
Also available for copy on https://colab.research.google.com/drive/1sLyAqMpCA2bYW2-SQ_OJHPzeHen4nd6S#scrollTo=PkV2BWDMy8_V
Upvotes: 0
Reputation: 763
Google Documents are not really regular files, so you cannot copy them into Google Colab. If you want to import data from them, you can use a library. For example, to import Google Sheets data, use gspread:
!pip install --upgrade gspread
And then
# Ask the Colab user to authenticate before talking to Google Sheets.
from google.colab import auth
auth.authenticate_user()
import gspread
# NOTE(review): oauth2client is deprecated; newer gspread releases may expect
# google-auth credentials instead -- confirm against the installed version.
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
# Open the spreadsheet by its title and take its first worksheet.
worksheet = gc.open('Your spreadsheet name').sheet1
# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
print(rows)
# Convert to a DataFrame and render.
import pandas as pd
pd.DataFrame.from_records(rows)
If you want to copy other files, remove the Google Documents from the folder and use the shell command
!cp -r SRC DEST
Upvotes: 2