Paulloed
Paulloed

Reputation: 373

How to copy a google file with Colaboratory?

I need to copy a folder and its entire content (with subfolders) to another folder in Google Drive.

I tried using Colaboratory like this:

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive
%cp -av FOLDER_TO_COPY NEW_FOLDER_COPY

Folders and files are copied, except for Google files, which give me the following error:
cp: cannot open 'path_to_file' for reading: Operation not supported
This happens for every .gdoc, .gsheet, .gslides, etc. I cannot convert those files to another format (like .docx or .xlsx) because some have complex formulas that I don't want to screw up.

I also tried this:

from shutil import copyfile

copyfile('path_to_file_to_copy', 
         'destination_path')

But got:
OSError: [Errno 95] Operation not supported: 'path_to_file'

How can I copy Google files using Colaboratory?

Upvotes: 3

Views: 2197

Answers (2)

Motin
Motin

Reputation: 5063

Google Documents are not really regular files, but you can copy them using Google Colab

Here is a complete Colab Notebook for doing this:

# -*- coding: utf-8 -*-
"""[Tool] Deeply copy shared folders in Google Drive using Google Drive API

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1sLyAqMpCA2bYW2-SQ_OJHPzeHen4nd6S
"""

#@title Notebook to deeply copy files in shared folders in Google Drive
from google.colab import drive

# Mount Google Drive (for access to the folder names & to help set paths)
print('Mounting Google Drive...')
drive.mount('/gdrive')

# Install and import necessary Google API libraries
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

#@title Notebook to Define the source and target paths

# For the source, you can provide a URL, folder ID, or My Drive path.
# For example, if the folder is shared and its URL is:
#   https://drive.google.com/drive/folders/abcdef1234567890
# you can set src to that URL or just the folder ID.
src = 'https://drive.google.com/drive/folders/foo'  #@param {type: 'string'}

# For the target path, we assume it's in your My Drive.
target = '/gdrive/MyDrive/folderfoo-cloned' #@param {type: 'string'}

#@title Copy ordinary (not Gsheets, GDocs etc) files
# Keep the form value under the name the later cells use.
src_path = src

import os
# Create the target directory on the mounted drive if it is missing, then
# double-check that it is really there before copying anything into it.
os.makedirs(target, exist_ok=True)
assert os.path.exists(target), f"Target '{target}' doesn't exist!"

# The copy lands in a sub-directory of target named after the last path
# component of src_path.
target_path = os.path.join(target, os.path.basename(src_path))
print(f'Copying files from "{src_path}" to "{target_path}"...')
os.makedirs(target_path, exist_ok=True)
# Shell escape: copy everything under src_path through the mounted file system.
# NOTE(review): this presumably only works when src_path is a mounted path, not
# a Drive URL or bare folder ID - confirm before relying on it for shared links.
!cp -rf "$src_path"/* "$target_path"  # also works when source is a shortcut

#@title Now make copies of workspace files which above failed with "Operation not supported"

from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

import os, re, csv

# Global list to store archival records for files.
# Each record will be a dict with: path, createdTime, modifiedTime, fileId, mimeType
archive_records = []

# --- Helper Functions ---

def extract_folder_id(url_or_id):
    """
    Return the Drive folder ID contained in a folder URL or a bare folder ID.

    Accepted inputs:
    - a drive.google.com URL containing '/folders/<folder_id>';
    - a drive.google.com URL carrying the ID as an 'id=<folder_id>' query
      parameter (for example '.../open?id=<folder_id>');
    - a bare folder ID (at least 10 word characters or hyphens).

    Surrounding whitespace is ignored, so a value pasted with a trailing
    newline still yields a clean ID.

    Raises:
        ValueError: if no folder ID can be extracted from the input.
    """
    candidate = url_or_id.strip()
    if "drive.google.com" in candidate:
        # Try the path form first, then the query-parameter form.
        for pattern in (r'/folders/([-\w]+)', r'[?&]id=([-\w]+)'):
            m = re.search(pattern, candidate)
            if m:
                return m.group(1)
        raise ValueError("Could not extract folder id from URL: " + url_or_id)
    # fullmatch (unlike match with '$') does not tolerate a trailing newline.
    if re.fullmatch(r'[\w-]{10,}', candidate):
        return candidate
    raise ValueError("Input is neither a valid URL nor a folder ID: " + url_or_id)

def is_url_or_id(path):
    """
    Tell whether the input looks like a Drive URL or a bare Drive ID.

    The input counts as a URL when it contains 'drive.google.com', and as an
    ID when it consists solely of 10 or more word characters or hyphens.
    Anything else (for example a slash-separated My Drive path) yields False.
    """
    looks_like_url = "drive.google.com" in path
    looks_like_id = re.match(r'^[\w-]{10,}$', path) is not None
    return looks_like_url or looks_like_id

def get_folder_id_from_path(path):
    """
    Given a My Drive path like /gdrive/MyDrive/Folder/Subfolder,
    return the corresponding Drive folder ID by traversing from 'root'.

    The '/gdrive/MyDrive' mount prefix is removed when present (with or
    without a trailing slash); any other path is treated as relative to the
    My Drive root. Empty path components (leading, trailing or doubled
    slashes) are ignored, so a path that names the root itself returns 'root'
    without any API call.

    When several folders share a name under the same parent, the first one
    returned by the API is followed.

    Raises:
        Exception: if a path component is not found under its parent.
    """
    mount_root = '/gdrive/MyDrive'
    if path == mount_root or path.startswith(mount_root + '/'):
        relative_path = path[len(mount_root):]
    else:
        relative_path = path
    parent_id = 'root'
    for part in relative_path.split('/'):
        if not part:
            # Skip the empty strings produced by leading/trailing/double slashes.
            continue
        # Backslashes and single quotes must be escaped inside a quoted
        # Drive query value, otherwise names such as "Bob's files" break it.
        escaped_part = part.replace("\\", "\\\\").replace("'", "\\'")
        query = (
            f"mimeType='application/vnd.google-apps.folder' and "
            f"name='{escaped_part}' and '{parent_id}' in parents and trashed=false"
        )
        result = drive_service.files().list(q=query, spaces='drive',
                                             fields='files(id, name)').execute()
        files = result.get('files', [])
        if not files:
            raise Exception(f"Folder not found: {part} under parent ID {parent_id}")
        parent_id = files[0]['id']
    return parent_id

def get_or_create_folder(folder_name, parent_id):
    """
    Look for a folder with the given name under the specified parent.
    If it does not exist, create it and return its ID.

    Args:
        folder_name: Name of the folder to find or create.
        parent_id: ID of the Drive folder to search in / create under.

    Returns:
        The ID of the existing folder (the first match when several folders
        share the name) or of the newly created one.
    """
    # Backslashes and single quotes must be escaped inside a quoted Drive
    # query value; without this, a name like "Bob's files" yields an invalid
    # query. Only the query is escaped - the created folder keeps the raw name.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"mimeType='application/vnd.google-apps.folder' and "
        f"name='{escaped_name}' and '{parent_id}' in parents and trashed=false"
    )
    result = drive_service.files().list(q=query, spaces='drive',
                                         fields='files(id, name)').execute()
    files = result.get('files', [])
    if files:
        return files[0]['id']
    file_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_id]
    }
    folder = drive_service.files().create(body=file_metadata, fields='id').execute()
    return folder['id']

def get_or_create_folder_by_path(path):
    """
    Given a My Drive path (e.g., /gdrive/MyDrive/TargetFolder/Subfolder),
    ensure the folder structure exists in Drive and return the final folder's ID.

    The '/gdrive/MyDrive' mount prefix is removed when present (with or
    without a trailing slash); any other path is treated as relative to the
    My Drive root. Empty path components (leading, trailing or doubled
    slashes) are ignored so that no folder with an empty name is ever
    created; a path that names the root itself returns 'root'.
    """
    mount_root = '/gdrive/MyDrive'
    if path == mount_root or path.startswith(mount_root + '/'):
        relative_path = path[len(mount_root):]
    else:
        relative_path = path
    parent_id = 'root'
    for part in relative_path.split('/'):
        if not part:
            # Nothing to create for the gaps left by extra slashes.
            continue
        parent_id = get_or_create_folder(part, parent_id)
    return parent_id

def copy_folder(src_folder_id, dest_parent_id, new_folder_name=None, current_path=""):
    """
    Recursively copy the folder with ID src_folder_id into the destination parent.

    Sub-folders are recreated (or reused when a folder of the same name is
    already there) through get_or_create_folder. Every other item is
    duplicated with the Drive API copy call, unless an item with the same
    name already exists in the destination folder, in which case it is
    skipped. One entry per non-folder item is appended to the global
    archive_records list, whether the item was copied or skipped.

    Args:
        src_folder_id: ID of the Drive folder to copy.
        dest_parent_id: ID of the Drive folder that receives the copy.
        new_folder_name: If provided, that name will be used for the
            destination folder; otherwise, the source folder's name is used.
        current_path: Relative path prefix used in printed messages and in
            the archival records; defaults to the destination folder's name.

    Returns:
        The ID of the destination folder.
    """
    # Determine the new folder's name.
    if new_folder_name is None:
        folder_meta = drive_service.files().get(fileId=src_folder_id, fields="name").execute()
        new_folder_name = folder_meta['name']
    # Set current_path if not provided.
    if not current_path:
        current_path = new_folder_name

    # Create (or reuse) the destination folder.
    dest_folder_id = get_or_create_folder(new_folder_name, dest_parent_id)

    # List items in the source folder, one page at a time.
    # Request createdTime and modifiedTime along with id, name, mimeType.
    query = f"'{src_folder_id}' in parents and trashed=false"
    page_token = None
    while True:
        response = drive_service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType, createdTime, modifiedTime)',
            pageToken=page_token
        ).execute()
        for file in response.get('files', []):
            file_id = file['id']
            file_name = file['name']
            mime_type = file['mimeType']
            # Build the full relative path for archival.
            full_path = f"{current_path}/{file_name}"

            if mime_type == 'application/vnd.google-apps.folder':
                print(f"Copying folder: {full_path}")
                # Recursively copy subfolder, updating the path.
                copy_folder(file_id, dest_folder_id, new_folder_name=file_name, current_path=full_path)
                continue

            # Escape backslashes first, then single quotes, so the name is a
            # valid quoted value in the query (a bare backslash would
            # otherwise swallow the closing quote or the next character).
            escaped_file_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
            query_file = f"'{dest_folder_id}' in parents and name='{escaped_file_name}' and trashed=false"
            existing_files = drive_service.files().list(
                q=query_file, spaces='drive', fields='files(id)'
            ).execute().get('files', [])
            if existing_files:
                print(f"Skipping file (already exists): {full_path}")
            else:
                print(f"Copying file: {full_path}")
                drive_service.files().copy(
                    fileId=file_id,
                    body={'name': file_name, 'parents': [dest_folder_id]}
                ).execute()
            # Record the archival metadata for this file (source file's ID and
            # timestamps), regardless of whether it was copied or skipped.
            archive_records.append({
                "path": full_path,
                "createdTime": file.get('createdTime', ''),
                "modifiedTime": file.get('modifiedTime', ''),
                "fileId": file_id,
                "mimeType": mime_type
            })
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break
    return dest_folder_id

# --- Main Code ---

# Work out once whether the source was given as a URL/ID or as a My Drive
# path, then resolve it to a folder ID accordingly.
src_is_url_or_id = is_url_or_id(src_path)
src_folder_id = (
    extract_folder_id(src_path)
    if src_is_url_or_id
    else get_folder_id_from_path(src_path)
)
print("Source folder ID:", src_folder_id)

# Look up the target parent folder in Drive, creating any missing levels.
target_parent_id = get_or_create_folder_by_path(target_path)
print("Target parent folder ID:", target_parent_id)

# Name the clone after the source folder: ask Drive for the real name when
# only a URL/ID is known, otherwise take the last component of the path.
if src_is_url_or_id:
    folder_meta = drive_service.files().get(fileId=src_folder_id, fields="name").execute()
    new_folder_name = folder_meta['name']
else:
    new_folder_name = os.path.basename(src_path.rstrip('/'))
print("Cloning folder:", new_folder_name)

# Run the recursive copy; it fills archive_records as a side effect.
dest_folder_id = copy_folder(src_folder_id, target_parent_id, new_folder_name)
print("Copied folder ID:", dest_folder_id)

# Dump the collected per-file metadata as a CSV file.
csv_filename = "archival_metadata.csv"
fieldnames = ["path", "createdTime", "modifiedTime", "fileId", "mimeType"]
with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(archive_records)

print(f"Archival metadata written to: {csv_filename}")

Also available for copy on https://colab.research.google.com/drive/1sLyAqMpCA2bYW2-SQ_OJHPzeHen4nd6S#scrollTo=PkV2BWDMy8_V

Upvotes: 0

mandulaj
mandulaj

Reputation: 763

Google Documents are not really regular files, so you cannot copy them into Google Colab. If you want to import data from them, you can use a library. For example, to import Google Sheets data, use gspread:

!pip install --upgrade gspread

And then

# Authenticate the current Colab user before talking to Google APIs.
from google.colab import auth
auth.authenticate_user()

import gspread
# NOTE(review): oauth2client is reported as deprecated upstream and newer
# gspread releases presumably expect google-auth credentials - confirm that
# this import and the authorize() call below still work in your runtime.
from oauth2client.client import GoogleCredentials

# Build a gspread client from the application-default credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the spreadsheet by its name and take its first worksheet.
worksheet = gc.open('Your spreadsheet name').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
print(rows)

# Convert to a DataFrame and render.
import pandas as pd
pd.DataFrame.from_records(rows)

If you want to copy other files, remove the Google Documents from the folder and use the shell command

!cp -r SRC DEST

Upvotes: 2

Related Questions