Reputation: 1658
I have looked at all the answers I can find and the closest one only provides four lines of code and I can't get it working within my larger script.
I want to:
I can't for life of me get the syntax right as there are no complete examples that I can find, only snippets, and chatGPT keeps getting it wrong...and I can't work out what I need to change when looking at all the debug variables.
(Running from VSCode in Win11).
Here is a working script (can run for days without failing, but does fail every now and then) - however, it only does one message at a time...this is no good when my INBOX has 150000+ messages:
!!! WARNING !!! Before you run this (if you try it), you should know this will move messages out of your INBOX and into new sub folders based upon the sender domain in reverse, one level per domain part (why no one has ever built this function into any email client is totally beyond me...it seems so logical...)
from __future__ import print_function
import os.path
import re
import json
import uuid
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
# SCOPES define the Gmail API permissions - delete token.json when changing
SCOPES = ['https://www.googleapis.com/auth/gmail.modify',
'https://www.googleapis.com/auth/gmail.labels']
def main():
"""Shows basic usage of the Gmail API.
Lists the user's Gmail labels.
"""
creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.json'):
creds = Credentials.from_authorized_user_file('token.json', SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'C:/path/to/credentials.json',
SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.json', 'w') as token:
token.write(creds.to_json())
try:
# Create a Gmail API service client
service = build('gmail', 'v1', credentials=creds)
# List messages in blocks of 500
page_token = None
while True:
results = service.users().messages().list(userId='me', labelIds=['INBOX'], maxResults=500, pageToken=page_token).execute()
messages = results.get('messages', [])
if not messages:
print('No messages found.')
break
# Get existing labels
labels = service.users().labels().list(userId='me').execute()
label_names = [label['name'] for label in labels['labels']]
for message in messages:
msg = service.users().messages().get(userId='me', id=message['id']).execute()
sender_email = get_sender_email(msg['payload']['headers'])
sender_domain = sender_email.split('@')[-1]
reversed_domain = '.'.join(reversed(sender_domain.split('.')))
label_path = create_label_path(reversed_domain, labels, label_names, service)
# Reget existing labels including new
for label in labels['labels']:
if label['name'] == label_path:
labelId = label['id']
break
# Apply the reversed sender domain path as a label to the message
if sender_email:
service.users().messages().modify(userId='me', id=message['id'], body={'removeLabelIds': 'INBOX', 'addLabelIds': labelId}).execute()
print(f'Applied label "{label_path}" to {sender_email}')
page_token = results.get('nextPageToken')
if not page_token:
break
except HttpError as error:
# TODO(developer) - Handle errors from gmail API.
print(f'An error occurred: {error}')
def get_sender_email(headers):
hfrom = ''
hrply = ''
hrtrn = ''
hsndr = ''
m = re.compile(r'^[^<]*<?([^@<> ]+@[^@<> ]+)>?$')
for header in headers:
if (header['name'] == 'From') and (m.match(header['value'])):
hfrom = m.match(header['value']).group(1)
elif (header['name'] == 'Reply-To') and (m.match(header['value'])):
hrply = m.match(header['value']).group(1)
elif (header['name'] == 'Return-Path') and (m.match(header['value'])):
hrtrn = m.match(header['value']).group(1)
elif (header['name'] == 'Sender') and (m.match(header['value'])):
hsndr = m.match(header['value']).group(1)
# just because the header field exists doesn't mean it is populated
if len(hfrom) > 0:
return hfrom
elif len(hrply) > 0:
return hrply
elif len(hrtrn) > 0:
return hrtrn
elif len(hsndr) > 0:
return hsndr
return 'nosuchuser@NoSender'
def create_label_path(revdom, labels, label_names, service):
domains = revdom.split('.')
label_path = ''
for domain in domains:
if len(label_path) > 0:
label_path = '/'.join(list((label_path, domain)))
else:
label_path = domain
# Check if the label exists:
# - have to create each label at each level, else
# '/' just becomes part of a single label
if label_path not in label_names:
# Create a new label
label = {'name': label_path}
created_label = service.users().labels().create(userId='me', body=label).execute()
labels['labels'].append(created_label)
label_names.append(label_path)
return label_path
if __name__ == '__main__':
main()
NOW, since this is abominably slow, what I am trying to do is do these updates in batches. (There is some code I haven't devised yet that is required, but it will basically select the message IDs of all messages that come from the same sender domain, then batch update the labels for those).
But before I can do that, I have to first get the message headers using batch.execute()
, and this is not working.
Here is an attempt at request batching (other approaches commented out but left in to see if anyone else can work out how to integrate them - I can't):
from __future__ import print_function
import os.path
import re
import json
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import BatchHttpRequest
from googleapiclient.errors import HttpError
# SCOPES define the Gmail API permissions - delete token.json when changing
SCOPES = ['https://www.googleapis.com/auth/gmail.modify',
'https://www.googleapis.com/auth/gmail.labels']
global headers_array # Added
# choose either callback function or container function
def callback(self, request_id, response, exception):
headers_array.append(response["payload"]["headers"])
# Helper container to store results.
# class DataContainer:
# def __init__(self):
# self.data = {}
# def callback(self, request_id, response, exception):
# if exception is not None:
# print('request_id: {}, exception: {}'.format(request_id, str(exception)))
# pass
# else:
# print(request_id)
# self.data[request_id] = response
# container = DataContainer()
def main():
creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.json'):
creds = Credentials.from_authorized_user_file('token.json', SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file('C:/path/to/credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.json', 'w') as token:
token.write(creds.to_json())
try:
# Create a Gmail API service client
service = build('gmail', 'v1', credentials=creds)
# List all message IDs in blocks of 500; 500 works and is the supported MAX
all_message_ids = []
results = service.users().messages().list(userId='me', labelIds=['INBOX'], maxResults=500).execute()
messages = results.get('messages', [])
all_message_ids.extend([message['id'] for message in messages])
while 'nextPageToken' in results:
page_token = results['nextPageToken']
results = service.users().messages().list(userId='me', labelIds=['INBOX'], maxResults=500, pageToken=page_token).execute()
messages = results.get('messages', [])
all_message_ids.extend([message['id'] for message in messages])
print(f'Total messages: {len(all_message_ids)}')
headers_array = []
# Retrieve headers for each message ID
for i in range(0, len(all_message_ids), 100):
batch_message_ids = all_message_ids[i:i+100]
# messages_batch = batch.add(service.users().messages().get(userId='me', ids=batch_message_ids, format='metadata', metadataHeaders=['From', 'Reply-To', 'Sender', 'Return-Path']))
# for msgid in all_message_ids:
# msg = (service.users().messages().get(userId='me', ids=msgid, format='metadata', metadataHeaders=['From', 'Reply-To', 'Sender', 'Return-Path']).execute())
######## incomplete - not working
# msg_ids = [msg['id'] for msg in body['messages']]
# headers['Content-Type'] = 'multipart/mixed; boundary=%s' % self.BOUNDARY
# post_body = []
# for msg_id in batch_message_ids:
# post_body.append(
# "--%s\n"
# "Content-Type: application/http\n\n"
# "GET /gmail/v1/users/me/messages/%s?format=raw\n"
# % (self.BOUNDARY, msg_id))
# post_body.append("--%s--\n" % self.BOUNDARY)
# post = '\n'.join(post_body)
# (headers, body) = _conn.request(
# SERVER_URL + '/batch',
# method='POST', body=post, headers=headers)
########
batch = service.new_batch_http_request(callback=callback)
for msg_id in batch_message_ids:
batch.add(service.users().messages().get(userId = 'me', id = msg_id, format='metadata', metadataHeaders=['From', 'Reply-To', 'Sender', 'Return-Path']))
batch.execute() # <--- BREAKS HERE WITH EXCEPTION
########
messages_batch = ''; # this is a placeholder so the script runs without syntax errors - I know it is wrong, just not sure how to 'capture' the batch results into a list/array/object to loop through below
for msg in messages_batch['messages']:
# message_id = msg['id']
headers = {
'message_id': msg['id'],
'from': msg["payload"]["headers"][0]["value"],
'reply_to': msg["payload"]["headers"][1]["value"],
'sender': msg["payload"]["headers"][2]["value"],
'return_path': msg["payload"]["headers"][3]["value"]
}
headers_array.append(headers)
# ToDo: do the label replacement here
print(headers_array)
except HttpError as error:
# TODO(developer) - Handle errors from gmail API.
print(f'An error occurred: {error}')
if __name__ == '__main__':
main()
I suspect the batch request is being malformed because I saw one example that added a callback argument, but I don't fully understand the callback syntax yet and am not sure how to integrate it or if it is needed.
What I want to see/receive in the batch response is a JSON object containing the selected headers for all 500 (100?) message IDs sent in the batch request. The loop would then send another batch request with the next 500 (100) message IDs, and so on until I have received the selected headers for all messages in my INBOX.
I have read the Gmail API and Google Apps Scripting documentation (before you suggest it), and to be frank, it is simply lacking details and comprehensive examples (for me). I have also read the API batch guide, but the example given is for a search query or sheets script and not relevant, and I can't work out how to translate it to work with the Gmail API.
I have also tried this with a Google Apps script, but there is a fundamental limitation where there is no apparent way to apply labels to messages, only to threads, where as it is possible to apply a label to a message using Python via the Gmail API (just love inconsistency :-/).
The performance of the working script that does not batch the requests is processing emails at between 1 and 2 per second. I have almost 150,000 to do...
Here is the full Google Apps Script which does everything but label individual messages :(:
function applyDomainLabelAndRemoveInboxLabel() {
var threads = GmailApp.getInboxThreads();
for (var i = 0; i < threads.length; i++) {
var messages = threads[i].getMessages();
for (var j = 0; j < messages.length; j++) {
var message = messages[j];
var msgId = message.getId();
// Logger.log(msgId);
var domlab = getDomainFromMessage(message);
// updateLabels(msgId, domlab[0]);
updateLabels(msgId, domlab);
}
}
}
function getDomainFromMessage(message) {
var rawMessage = message.getRawContent();
var domain = null;
var matchFrom = rawMessage.match(/From:.*<([^>]+)>/i);
var matchReplyTo = rawMessage.match(/Reply-To:.*<([^>]+)>/i);
var matchReturnPath = rawMessage.match(/Return-Path: <([^>]+)>/i);
var matchSender = rawMessage.match(/Sender:.*<([^>]+)>/i);
if (matchFrom) {
domain = extractDomain(matchFrom[1]);
} else if (matchReplyTo) {
domain = extractDomain(matchReplyTo[1]);
} else if (matchReturnPath) {
domain = extractDomain(matchReturnPath[1]);
} else if (matchSender) {
domain = extractDomain(matchSender[1]);
}
return domain ? [domain] : [];
}
function extractDomain(email) {
var parts = email.split('@');
var revdom = null;
if (parts.length === 2) {
revdom = reverseDomain(parts[1]);
return revdom;
}
return null;
}
function reverseDomain(domain) {
var domparts = domain.split('.');
domparts.reverse();
var newLabel = null;
for (var k = 0; k < domparts.length; k++) {
if (k === 0) {
newLabel = domparts[k]
} else {
newLabel = newLabel.concat("/", domparts[k]);
}
var labelObject = GmailApp.getUserLabelByName(newLabel);
// Logger.log(labelObject.getName());
if (!labelObject) {
labelObject = GmailApp.createLabel(newLabel)
}
}
// not sure which of these to return
return labelObject; // can't find a way to get label IDs using GApps Script
// return newLabel;
}
function updateLabels(msgId, addLabel) {
var jsonLabel = JSON.stringify(addLabel)
// BREAK HERE - CANNOT APPLY LABEL TO MESSAGE
var msgLabel = Gmail.Users.Messages.modify({
'addLabelIds': [jsonLabel], // can't get this line right
'removeLabelIds': ['INBOX']
}, 'me', msgId);
}
Upvotes: 1
Views: 589
Reputation: 201378
In the current stage, the official document says as follows. Ref
You're limited to 100 calls in a single batch request. If you need to make more calls than that, use multiple batch requests. Please be careful about this.
And, in your showing script, how about the following modification?
headers_array = []
# Retrieve headers for each message ID
for i in range(0, len(all_message_ids), 500):
batch_message_ids = all_message_ids[i:i+500]
batch = service.new_batch_http_request()
for msg_id in batch_message_ids:
batch.add(service.users().messages().get(userId = 'me', id = msg_id, format='metadata', metadataHeaders=['From', 'Reply-To', 'Sender', 'Return-Path']))
batch.execute() # <--- BREAKS HERE WITH EXCEPTION
global headers_array # Added
headers_array = []
# Retrieve headers for each message ID
for i in range(0, len(all_message_ids), 500):
batch_message_ids = all_message_ids[i:i+100] # Modified
batch = service.new_batch_http_request(callback=sample)
for msg_id in batch_message_ids:
batch.add(service.users().messages().get(userId='me', id=msg_id, format='metadata',metadataHeaders=['From', 'Reply-To', 'Sender', 'Return-Path']))
batch.execute()
print(headers_array)
As the callback function, please add the following function.
def sample(id, res, err):
# print(id)
# print(err)
headers_array.append(res["payload"]["headers"])
From your updated question, if you want to retrieve messages using the batch request by reflecting my proposed modification, unfortunately, I think that you are not correctly using my proposed script. If your middle script is modified using my proposed script, please test the following modified script.
from __future__ import print_function
import os.path
import re
import json
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import BatchHttpRequest
from googleapiclient.errors import HttpError
# SCOPES define the Gmail API permissions - delete token.json when changing
SCOPES = ['https://www.googleapis.com/auth/gmail.modify',
'https://www.googleapis.com/auth/gmail.labels']
global headers_array
headers_array = []
def callback(request_id, msg, exception):
head = msg["payload"]["headers"]
headLen = len(head)
headers = {
'message_id': msg['id'],
'from': head[0]["value"] if headLen > 0 else "",
'reply_to': head[1]["value"] if headLen > 1 else "",
'sender': head[2]["value"] if headLen > 2 else "",
'return_path': head[3]["value"] if headLen > 3 else ""
}
headers_array.append(headers)
def main():
creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.json'):
creds = Credentials.from_authorized_user_file('token.json', SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file('C:/path/to/credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.json', 'w') as token:
token.write(creds.to_json())
try:
# Create a Gmail API service client
service = build('gmail', 'v1', credentials=creds)
all_message_ids = []
results = service.users().messages().list(
userId='me', labelIds=['INBOX'], maxResults=500).execute()
messages = results.get('messages', [])
all_message_ids.extend([message['id'] for message in messages])
while 'nextPageToken' in results:
page_token = results['nextPageToken']
results = service.users().messages().list(userId='me', labelIds=[
'INBOX'], maxResults=500, pageToken=page_token).execute()
messages = results.get('messages', [])
all_message_ids.extend([message['id'] for message in messages])
print(f'Total messages: {len(all_message_ids)}')
for i in range(0, len(all_message_ids), 100):
batch_message_ids = all_message_ids[i:i+100]
batch = service.new_batch_http_request(callback=callback)
for msg_id in batch_message_ids:
batch.add(service.users().messages().get(userId='me', id=msg_id, format='metadata',
metadataHeaders=['From', 'Reply-To', 'Sender', 'Return-Path']))
batch.execute()
print(headers_array)
except HttpError as error:
# TODO(developer) - Handle errors from gmail API.
print(f'An error occurred: {error}')
if __name__ == '__main__':
main()
When I tested this modified script, no error occurred. I confirmed that the headers can be retrieved from the response values of batch requests.
And, this modified script supposes that your client of service
can be used for retrieving the messages using Gmail API. Please be careful about this.
Upvotes: 2