Simone Fildi
Simone Fildi

Reputation: 13

Word file to Python dictionary

I'm trying to turn a *.docx file with questions into a python dictionary.

The questions have this format:

  1. Question
    a. first answer
    b. second answer
    c. third answer
    d. fourth answer
    e. fifth answer

In the file, the correct answer is the bold one, in this case the third. The word file is built with MS Word bullet points (1. and so on for questions, and a. and so on for answers).

The resulting dictionary should be like:

{
  '1': {  
    
    'question': 'the question text',
    'answer': ['first answer','second answer','third answer','fourth answer','fifth answer'],
    'correct_answer': 2
   },
    
   Other questions...

}

I tried this code:

from docx import *

def is_bold(run):
    """Report the bold setting stored on *run*.

    The ``bold`` attribute is handed back exactly as found on the object;
    it is not coerced to ``bool``, so any non-boolean value (for example
    ``None``) is returned unchanged.
    """
    bold_setting = run.bold
    return bold_setting

# Script: scan the paragraphs of 'sample.docx', treat every paragraph whose
# text looks like "N. ..." as a question and the following "x. ..." paragraphs
# as its answers, and record which answer's first run is bold.

# Open the document
doc = Document('sample.docx')

# Create an empty dictionary for questions and answers
# (keyed by the question-number token, see the split below)
questions_and_answers = {}

# Iterate only through paragraphs
for paragraph in doc.paragraphs:
    text = paragraph.text.strip()

    # Check if the paragraph starts with a number and a dot
    # NOTE: only text[0] and text[1] are inspected, so this matches single-digit
    # numbers only ("10." fails because text[1] is "0"), and a one-character
    # paragraph that is a digit raises IndexError on text[1].
    # NOTE(review): if Word's automatic list numbering is not part of
    # paragraph.text, this condition never matches and the dictionary stays
    # empty -- confirm against the actual document.
    if text and text[0].isdigit() and text[1] == '.':
        # The first space-separated token becomes the key, so it keeps its
        # trailing dot (e.g. "1."); a paragraph without a space raises
        # ValueError on this unpacking.
        question_number, question = text.split(' ', 1)
        answer_choices = []
        correct_answer_index = None

        # Continue to the next paragraph that will contain the answers
        next_paragraph = paragraph
        while True:
            # NOTE(review): relies on paragraph objects exposing a
            # `next_paragraph` attribute -- confirm it exists in the installed
            # python-docx version.
            next_paragraph = next_paragraph.next_paragraph

            # If there are no more paragraphs or it starts with a number, we've reached the end of the answers
            if not next_paragraph or (next_paragraph.text.strip() and next_paragraph.text.strip()[0].isdigit()):
                break

            next_text = next_paragraph.text.strip()

            # If it starts with a letter and a period, consider it as an answer
            # (same text[1] caveat as above: a single-letter paragraph raises IndexError)
            if next_text and next_text[0].isalpha() and next_text[1] == '.':
                answer_run = next_paragraph.runs[0]  # Consider only the first "run" to check the style
                answer_text = next_text[3:]  # Remove the answer format (a., b., c., ...); assumes a 3-character "x. " prefix
                answer_choices.append(answer_text)

                # Check if the answer is bold (hence, correct)
                # If several answers are bold, the last one wins.
                if is_bold(answer_run):
                    correct_answer_index = len(answer_choices) - 1  # Save the (0-based) index of the correct answer

        # Add the question and answers to the dictionary
        questions_and_answers[question_number] = {
            'question': question,
            'answers': answer_choices,
            'correct_answer_index': correct_answer_index
        }

# Print the resulting dictionary, one question per block followed by a blank line
for number, data in questions_and_answers.items():
    print(f"{number}: {data['question']}")
    print("Answers:")
    for answer in data['answers']:
        print(f"- {answer}")
    print(f"Index of the correct answer: {data['correct_answer_index']}")
    print()

Unfortunately, I'm getting an empty dictionary. How do I fix this?

Upvotes: 1

Views: 194

Answers (1)

Nps-rf
Nps-rf

Reputation: 89

Related question

According to ReadThedocs.Python-DocX: Style-related objects - _NumberingStyle objects, this functionality is not implemented yet.

However, by merging these solutions, we can do something like this:

import sys
import docx
from docx2python import docx2python as dx2py


def ns_tag_name(node, name):
    """Build the namespace-qualified form of *name* for *node*.

    When *node* has both a non-empty ``nsmap`` and a ``prefix``, the result is
    ``"{<namespace>}<name>"`` where the namespace is looked up in
    ``node.nsmap`` under ``node.prefix``. Otherwise *name* is returned as is.
    """
    if not (node.nsmap and node.prefix):
        return name
    namespace = node.nsmap[node.prefix]
    # The ":s" specs keep the original's requirement that both parts are strings.
    return f"{{{namespace:s}}}{name:s}"


def descendants(node, desc_strs):
    """Walk down from *node* following one tier of tag names per level.

    *desc_strs* is a sequence of tiers; each tier is an iterable of
    unqualified tag names to look for among the children at that depth.

    Returns:
        ``[]`` when *node* is ``None``; ``[node]`` when no tiers remain;
        otherwise a dict mapping each tag name of the first tier that produced
        a match to a list holding the results gathered beneath its children
        (list results are flattened in, dict results are appended whole).
        Children whose subtree yields nothing are left out.
    """
    if node is None:
        return []
    if not desc_strs:
        return [node]

    current_tier, remaining_tiers = desc_strs[0], desc_strs[1:]
    found = {}
    for tag in current_tier:
        qualified_tag = ns_tag_name(node, tag)
        for element in node.iterchildren(qualified_tag):
            nested = descendants(element, remaining_tiers)
            if nested:
                bucket = found.setdefault(tag, [])
                if isinstance(nested, list):
                    bucket.extend(nested)
                else:
                    bucket.append(nested)
    return found


def simplified_descendants(desc_dict):
    """Flatten a nested ``{key: [items]}`` mapping into a single list.

    Every value of *desc_dict* is iterated in order; items that are
    themselves dicts are flattened recursively in place, and all other items
    are kept as they are. Keys are discarded.
    """
    flat = []
    for children in desc_dict.values():
        for item in children:
            if not isinstance(item, dict):
                flat.append(item)
                continue
            flat += simplified_descendants(item)
    return flat


def process_list_data(attrs):
    """Return the integer ``val`` attribute of the first leaf found in *attrs*.

    *attrs* is flattened with ``simplified_descendants``; the first resulting
    element's ``val`` attribute (namespace-qualified via ``ns_tag_name``) is
    converted with ``int`` and returned. The caller in this file passes the
    matches for the ``ilvl`` tag, so the value is used as the list level.
    """
    first_leaf = simplified_descendants(attrs)[0]
    attributes = first_leaf.attrib
    val_key = ns_tag_name(first_leaf, "val")
    return int(attributes[val_key])


def collect_list_with_levels(fname=r"./doc.docx"):
    """Collect the numbered-list paragraphs of a .docx file with their level.

    Args:
        fname: Path of the document to read. Defaults to ``./doc.docx``, the
            path that was previously hard-coded.

    Returns:
        A list with one dict per paragraph that carries a ``pPr/numPr/ilvl``
        element, in document order. Each dict has the keys ``"text"`` (the
        paragraph text), ``"level"`` (the integer ``ilvl`` value) and
        ``"bold"`` (whether any run of the paragraph has a truthy
        ``font.bold``).

        Returns ``-1`` (after printing a message) when the paragraph count
        seen by python-docx differs from the one seen by docx2python; callers
        must check for this sentinel before iterating the result.
    """
    docd = docx.Document(fname)
    docdpy = dx2py(fname)
    # NOTE(review): presumably the paragraph list of the document body as
    # docx2python nests it -- confirm against the docx2python documentation.
    docdpy_runs = docdpy.document_runs[0][0][0]

    # Read the paragraph list once and reuse it for the check and the loop.
    paragraphs = docd.paragraphs
    # The docx2python view is only used as a sanity check on paragraph count.
    if len(paragraphs) != len(docdpy_runs):
        print("Lengths don't match. Abort")
        return -1

    # One tier of tag names per nesting level: w:pPr -> w:numPr -> w:ilvl.
    # ("numId" could be added to the last tier to match entries from
    # word/numbering.xml.)
    subnode_tags = (("pPr",), ("numPr",), ("ilvl",))

    result = []
    for par in paragraphs:
        numbered_attrs = descendants(par._element, subnode_tags)
        if not numbered_attrs:
            # Not part of a numbered/bulleted list: skip it.
            continue
        result.append({
            "text": par.text,
            "level": process_list_data(numbered_attrs),
            "bold": any(run.font.bold for run in par.runs),
        })
    return result


def build_qa_dict(docx_content):
    """Group list paragraphs into numbered questions with their answers.

    Args:
        docx_content: Iterable of dicts with the keys ``"text"``, ``"level"``
            and ``"bold"``. A falsy ``"level"`` (0) marks a question; any
            other level marks an answer to the most recent question.

    Returns:
        A dict keyed by the question's running number as a string
        (``"1"``, ``"2"``, ...). Each value has the keys ``'question'``
        (question text), ``'answers'`` (list of answer texts in order) and
        ``'correct_answer'`` (1-based position of the bold answer, or ``None``
        when no answer is bold; if several are bold the last one wins).

    Every question is kept, including one with no answers. Previously the
    last question was dropped when it had no answers or an empty text, while
    the same question elsewhere in the document was kept. Answer paragraphs
    that appear before the first question are ignored.
    """
    qa_dict = {}
    current = None  # entry of the question currently collecting answers

    for par in docx_content:
        if not par["level"]:
            # Register the question immediately so that no trailing
            # "save the last one" step can treat it differently.
            current = {
                'question': par['text'],
                'answers': [],
                'correct_answer': None,
            }
            qa_dict[str(len(qa_dict) + 1)] = current
        elif current is not None:
            current['answers'].append(par['text'])
            if par['bold']:
                # 1-based position; subtract 1 for a 0-based index.
                current['correct_answer'] = len(current['answers'])

    return qa_dict


if __name__ == "__main__":
    # Script entry point: read the list paragraphs, build the question/answer
    # dictionary and print it.
    data = collect_list_with_levels()
    # collect_list_with_levels() returns -1 (after printing a message) when it
    # aborts; passing that on would fail with a TypeError while iterating an
    # int, so stop here with a non-zero exit status instead.
    if not isinstance(data, list):
        sys.exit(1)
    qa_dict = build_qa_dict(data)
    print(qa_dict)

Input

input-docx-list

Output

{
  "1": {
    "question": "Question",
    "answers": [
      "Answer",
      "Answer2",
      "AnswerCorrect"
    ],
    "correct_answer": 3
  },
  "2": {
    "question": "Question2",
    "answers": [
      "AnswerNew",
      "AnswerCorrect2",
      "AnswerNew2",
      "AnswerNew3"
    ],
    "correct_answer": 2
  }
}

Upvotes: 0

Related Questions