Reputation: 416
I have a directory on SharePoint that contains n subdirectories. Each subdirectory has files with either a .doc or a .docx extension. I want to read their text (ideally as plain text, so that I can parse every string). I know about docx2txt, but I believe it needs the file to be present on the local machine. Is there a better way?
I am using graph api to scan/browse sharepoint directory.
Looking for some direction please.
import requests
import pathlib
# Copy access_token and specify the MS Graph API endpoint you want to call, e.g. 'https://graph.microsoft.com/v1.0/groups' to get all groups in your organization
#access_token = '{ACCESS TOKEN YOU ACQUIRED PREVIOUSLY}'
# Page through a Microsoft Graph listing and report every item whose name
# contains one of the strings in finalReportNames (case-insensitive).
url = "https://graph.microsoft.com/v1.0/......"
# Microsoft Graph documents the header form as "Authorization: Bearer <token>".
headers = {"Authorization": f"Bearer {token_result['access_token']}"}
consentfilecount = 0  # never incremented below; printed unchanged at the end
clientreportcount = 0  # number of (item, report name) matches
graphlinkcount = 0  # number of @odata.nextLink pages followed
token_refreshed = False  # True while the current page is being retried

while True:
    try:
        graph_result = requests.get(url=url, headers=headers)
        graph_result.raise_for_status()
    except requests.exceptions.HTTPError:
        if token_refreshed:
            # A fresh token did not help, so this is not an expired-token
            # failure; surface it instead of looping forever.
            raise
        token_result = client.acquire_token_for_client(scopes=scope)
        headers = {"Authorization": f"Bearer {token_result['access_token']}"}
        token_refreshed = True
        # Retry the same page rather than parsing the failed response.
        continue
    token_refreshed = False

    page = graph_result.json()  # parse the body once per page
    for item in page.get("value", []):
        item_name = item["name"]
        for ele in finalReportNames:
            if ele.lower() in item_name.lower():
                clientreportcount += 1
                # NOTE(review): per the driveItem documentation, webUrl is the
                # address that shows the item in a browser; file bytes come
                # from the item's "@microsoft.graph.downloadUrl" property or
                # its /content endpoint. Confirm this request returns what
                # is needed.
                response = requests.get(item["webUrl"], headers=headers)
                print(response)
                print(item_name)
                print(item["webUrl"])
                print(pathlib.Path(item_name).suffix)

    if "@odata.nextLink" not in page:
        break
    url = page["@odata.nextLink"]
    graphlinkcount += 1

print(consentfilecount)
Upvotes: 0
Views: 354
Reputation: 83
Get a SharePoint access token and specify the URL of your directory:
import requests
import docx2txt
# The rest of the snippet reads the token through the name `access_token`.
access_token = get_sharepoint_access_token()
acc_tok = access_token  # original name, kept as an alias
# URL of the directory listing to request (placeholder to be filled in).
dir_url = "your directory url"
**Create the request headers, then list the directory and download each file**
# List the directory at dir_url and save every downloadable entry into the
# current working directory under its own name.
# NOTE(review): "odata=verbose" is the SharePoint REST flavour, whose payload
# nests results under "d"; confirm it matches the API behind dir_url, since
# the code below reads a top-level "value" list.
headers = {
    "Authorization": f"Bearer {access_token}",
    "Accept": "application/json;odata=verbose"
}
response = requests.get(dir_url, headers=headers)
# Stop on an HTTP error status instead of failing later on a missing key.
response.raise_for_status()
files = response.json()["value"]
for file in files:
    # Microsoft Graph names this property "@microsoft.graph.downloadUrl"; the
    # "@content.downloadUrl" key used originally is kept as a fallback.
    file_url = file.get("@microsoft.graph.downloadUrl") or file.get("@content.downloadUrl")
    if not file_url:
        # presumably a folder or another entry without downloadable content
        continue
    file_name = file["name"]
    download = requests.get(file_url, headers={"Authorization": f"Bearer {access_token}"})
    download.raise_for_status()
    # Open the target only after the download succeeded, so a failed request
    # does not leave an empty or bogus file behind.
    with open(file_name, "wb") as f:
        f.write(download.content)
**Now convert them to plain text**
import os  # required by os.listdir(); missing from the snippet's imports

# Extract plain text from every .docx file in the current working directory.
texts = {}  # file name -> extracted plain text
for file_name in os.listdir():
    # Only .docx is handled here; legacy .doc files are skipped.
    if file_name.lower().endswith(".docx"):
        # Pass the path itself: a .docx is a zip archive, so a handle opened
        # in text mode ("r") cannot be read as one.
        text = docx2txt.process(file_name)
        texts[file_name] = text
Upvotes: 0