Reputation: 208
I want to convert all the .doc files from a particular folder to .docx file.
I tried using the following code,
import subprocess
import os
# Hand every '.doc' file in the current working directory to LibreOffice
# ('soffice') for conversion to .docx.
for filename in os.listdir(os.getcwd()):
    # Only names ending in the literal, lower-case suffix '.doc' are processed.
    if filename.endswith('.doc'):
        print filename
        # NOTE(review): the OSError [Errno 2] reported for this snippet is
        # presumably raised here because the 'soffice' executable cannot be
        # found on PATH -- confirm on the affected machine.
        subprocess.call(['soffice', '--headless', '--convert-to', 'docx', filename])
But it gives me an error: OSError: [Errno 2] No such file or directory
Upvotes: 12
Views: 52988
Reputation: 15219
By default, the os.path.exists() function in Python on Windows is case-insensitive, regardless of whether you have enabled case sensitivity for a specific folder. This means that:
Checking for "Cv.pdf" will return True if "cv.pdf" exists, even if the cases don't match.
If you want to enforce case-sensitive checks for file existence, you can manually check the case using os.listdir()
to compare the actual filenames:
def case_sensitive_exists(file_path):
    """Return True if *file_path* exists with exactly this filename case.

    Only the final path component is compared case-sensitively; the
    directory part is resolved by the operating system as usual.

    Args:
        file_path: Path to check. A bare filename is looked up in the
            current working directory.

    Returns:
        True when the directory listing contains an entry whose name matches
        the final component exactly, otherwise False (including when the
        directory does not exist or cannot be listed).
    """
    directory, file_name = os.path.split(file_path)
    try:
        # os.path.split() yields '' for a bare filename and os.listdir('')
        # raises, so fall back to the current directory.
        entries = os.listdir(directory or os.curdir)
    except OSError:
        # A missing or unreadable directory means the file does not exist.
        return False
    return file_name in entries
Upvotes: 0
Reputation: 390
This version uses doc2docx, which I believe only works on Windows or macOS. I believe this is the cleanest version so far, if you can use Windows. To use it, you must first install doc2docx, which can be done from Anaconda (or with pip).
import doc2docx
from glob import glob
import os
def convert_doc_to_docx(folder):
    """Convert every .doc file in *folder* to .docx and delete the originals.

    An original is only deleted once its converted counterpart exists, so a
    failed or partial conversion never destroys the source document.

    Args:
        folder: Directory containing the .doc files to convert.
    """
    # Snapshot the .doc files before converting so that only pre-existing
    # originals are candidates for removal.
    doc_files = glob(os.path.join(folder, '*.doc'))
    # doc2docx converts all files in a given folder in one call.
    doc2docx.convert(folder)
    for doc_file in doc_files:
        # Assumes doc2docx writes "<stem>.docx" next to the source file --
        # TODO confirm. If that file is missing, keep the original.
        if os.path.exists(os.path.splitext(doc_file)[0] + '.docx'):
            os.remove(doc_file)
convert_doc_to_docx('C:/Users/user/folder_containing_doc_files/')
Upvotes: 1
Reputation: 61
based on dshefman's code,
import re
import os
import sys
import win32com.client as win32
from win32com.client import constants
# Get path from command line argument
ABS_PATH = sys.argv[1]
def save_as_docx(path):
    """Save the Word document at *path* as .docx next to the original.

    Args:
        path: Path to the source document; it may be relative to Python's
            current working directory.

    Any error raised while saving propagates to the caller after the
    document has been closed.
    """
    # Resolve the path in Python first: Word is a separate process and may
    # resolve a relative path against its own current folder.
    source_abs = os.path.abspath(path)
    # Swap the final extension for .docx
    new_file_abs = re.sub(r'\.\w+$', '.docx', source_abs)

    # Opening MS Word
    word = win32.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(source_abs)
    try:
        # Save through the document that was just opened instead of
        # word.ActiveDocument, so another open window cannot be saved by
        # mistake.
        doc.SaveAs(new_file_abs, FileFormat=constants.wdFormatXMLDocument)
    finally:
        # Close without saving changes to the source, even if SaveAs failed.
        doc.Close(False)
def main():
    """Walk ABS_PATH recursively and convert each .doc file found to .docx."""
    for folder, _subdirs, names in os.walk(ABS_PATH):
        for name in names:
            extension = os.path.splitext(name)[1]
            # The extension comparison is case-insensitive.
            if extension.lower() != ".doc":
                continue
            doc_path = os.path.join(folder, name)
            save_as_docx(doc_path)
            print("%s ==> %sx" % (doc_path, name))


if __name__ == "__main__":
    main()
Upvotes: 2
Reputation: 1007
Here is a solution that worked for me. The other solutions proposed did not work on my Windows 10 machine using Python 3.
from glob import glob
import re
import os
import win32com.client as win32
from win32com.client import constants
# Create list of paths to .doc files
paths = glob('C:\\path\\to\\doc\\files\\**\\*.doc', recursive=True)
def save_as_docx(path):
    """Save the Word document at *path* as .docx next to the original.

    Args:
        path: Path to the source document; it may be relative to Python's
            current working directory.

    Any error raised while saving propagates to the caller after the
    document has been closed.
    """
    # Resolve the path in Python first: Word is a separate process and may
    # resolve a relative path against its own current folder.
    source_abs = os.path.abspath(path)
    # Swap the final extension for .docx
    new_file_abs = re.sub(r'\.\w+$', '.docx', source_abs)

    # Opening MS Word
    word = win32.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(source_abs)
    try:
        # Save through the document that was just opened instead of
        # word.ActiveDocument, so another open window cannot be saved by
        # mistake.
        doc.SaveAs(
            new_file_abs, FileFormat=constants.wdFormatXMLDocument
        )
    finally:
        # Close without saving changes to the source, even if SaveAs failed.
        doc.Close(False)
for path in paths:
save_as_docx(path)
Upvotes: 22
Reputation: 477
If you don't like to rely on sub-process calls, here is the version with COM client. It is useful if you are targeting windows users without LibreOffice installed.
#!/usr/bin/env python
"""Convert every .doc file in the current directory to out<N>.docx via Word."""
import glob
import os  # needed for os.path.abspath below; the import was missing

import win32com.client

word = win32com.client.Dispatch("Word.Application")
word.visible = 0
try:
    for i, doc in enumerate(glob.iglob("*.doc")):
        # Pass absolute paths: Word is a separate process and need not
        # share this script's working directory.
        in_file = os.path.abspath(doc)
        wb = word.Documents.Open(in_file)
        try:
            out_file = os.path.abspath("out{}.docx".format(i))
            wb.SaveAs2(out_file, FileFormat=16)  # file format for docx
        finally:
            # Close the document even if saving failed.
            wb.Close()
finally:
    # Always shut Word down so a failed conversion does not leave a hidden
    # instance running.
    word.Quit()
Upvotes: 3
Reputation: 5935
I prefer to use the glob module for tasks like that. Put this in a file named doc2docx.py. To make it executable, run chmod +x doc2docx.py. Optionally, put that file in your $PATH as well, to make it available "everywhere".
#!/usr/bin/env python
import glob
import subprocess
# Run headless LibreOffice once for each .doc file in the current directory.
for doc in glob.iglob("*.doc"):
    subprocess.call(
        ['soffice', '--headless', '--convert-to', 'docx', doc]
    )
Though ideally you'd leave the expansion to the shell itself, and call doc2docx.py with the files as arguments, like doc2docx.py *.doc:
#!/usr/bin/env python
import subprocess
import sys
# At least one file argument is required.
if len(sys.argv) < 2:
    sys.stderr.write("SYNOPSIS: %s file1 [file2] ...\n" % sys.argv[0])
    # Exit with a non-zero status so callers can detect the usage error
    # instead of seeing an apparently successful run.
    sys.exit(1)
# Convert each file named on the command line with headless LibreOffice.
for doc in sys.argv[1:]:
    subprocess.call(['soffice', '--headless', '--convert-to', 'docx', doc])
As requested by @pyd, to output to a target directory myoutputdir
use:
#!/usr/bin/env python
import subprocess
import sys
# At least one file argument is required.
if len(sys.argv) < 2:
    sys.stderr.write("SYNOPSIS: %s file1 [file2] ...\n" % sys.argv[0])
    # Exit with a non-zero status so callers can detect the usage error
    # instead of seeing an apparently successful run.
    sys.exit(1)
# Convert each file named on the command line, writing the results into
# the 'myoutputdir' directory.
for doc in sys.argv[1:]:
    subprocess.call(['soffice', '--headless', '--convert-to', 'docx', '--outdir', 'myoutputdir', doc])
Upvotes: 5
Reputation: 4894
Use os.path.join
to specify the correct directory.
import os, subprocess
# Folder whose .doc files should be converted.
main_dir = os.path.join('/', 'Users', 'username', 'Desktop', 'foldername')
for filename in os.listdir(main_dir):
    if filename.endswith('.doc'):
        # Parenthesised form prints the same on Python 2 and Python 3.
        print(filename)
        # os.listdir() returns bare names, so join each one with main_dir;
        # otherwise soffice only finds the file when the current working
        # directory happens to be main_dir. --outdir keeps the converted
        # file next to its source.
        subprocess.call(['soffice', '--headless', '--convert-to', 'docx',
                         '--outdir', main_dir,
                         os.path.join(main_dir, filename)])
Upvotes: 2