Reputation: 81
I have some sample text like this:
1 Header
bla bla bla...
bla bla bla...
1.1 SubHeader
bla bla bla...
bla bla bla...
I read each page of the .docx file, and I want to store each header together with the paragraphs under it in a dictionary:
{"1 Header", "bla bla bla"...},
{"1.1 Subheader", "bla bla bla"...},
where the key is a string (the header name) and the value is a Paragraphs collection or a List<Paragraph>.
I know that this can be done with Selection(), but I don't know how to use it.
// Scan the paragraph collection (indexed from 1 through Count) for header / sub-header paragraphs.
for (int index = 1; index <= paragraphCollection.Count; index++)
{
    var current = paragraphCollection[index];
    bool startsSection = isHeader(current.Range.Text) || isSubHeader(current.Range.Text);
    if (!startsSection)
    {
        continue;
    }
    //collect paragraphs until we find another header or subheader
}
Upvotes: 3
Views: 725
Reputation: 183
I'm not sure this is what you want. My code is far more monstrous.
/// <summary>
/// Result list: one dictionary per heading, keyed by (outline level, heading text) and
/// holding the texts of the paragraphs collected under that heading.
/// </summary>
public List<Dictionary<Tuple<int, String>, List<string>>> MyList = new List<Dictionary<Tuple<int, string>, List<string>>>();
/// <summary>
/// Visits every heading of the active document in order and appends to <c>MyList</c> one
/// single-entry dictionary per heading. The key is (outline level, heading text); the value
/// is the text of each body paragraph between that heading and the next one.
/// </summary>
/// <remarks>
/// Navigation is done by moving the application's Selection, so the current selection is
/// changed as a side effect. A heading that is directly followed by another heading (or that
/// is the last paragraph of the document) is stored with an empty paragraph list.
/// </remarks>
public void IteratingHeadingsForDoSomething()
{
    // Collection of heading strings, no location info. It is indexed from 1 below.
    Array listHeadingsString = (Array)(Object)this.ThisApplication.ActiveDocument.GetCrossReferenceItems(WdReferenceType.wdRefTypeHeading);
    int headingsCount = listHeadingsString.Length;

    // End position of the document's last paragraph. Hoisted out of the loops because
    // nothing in this method edits the document.
    int documentEnd = ThisApplication.ActiveDocument.Paragraphs.Last.Range.End;

    ThisApplication.Selection.GoTo(What: Word.WdGoToItem.wdGoToHeading, Which: Word.WdGoToDirection.wdGoToFirst); // goto first heading
    for (int i = 1; i <= headingsCount; i++)
    {
        Dictionary<Tuple<int, string>, List<string>> myDictionary = new Dictionary<Tuple<int, string>, List<string>>();
        List<string> paras = new List<string>();
        string headingString = listHeadingsString.GetValue(i).ToString();
        int headingOutlineLevel = (int)(Word.WdOutlineLevel)ThisApplication.Selection.Range.Paragraphs.OutlineLevel;
        Tuple<int, string> heading = new Tuple<int, string>(headingOutlineLevel, headingString);

        // Format explicitly: Debug.WriteLine(string, object) would treat the heading text as a
        // format string, never print the level, and throw on headings containing braces.
        Debug.WriteLine(string.Format("{0} (outline level {1})", headingString, headingOutlineLevel));

        // If the heading is the final paragraph there is no following paragraph to select.
        bool headingIsLastParagraph = ThisApplication.Selection.Range.Paragraphs.Last.Range.End == documentEnd;
        if (!headingIsLastParagraph)
        {
            SelectNextParagraph();
            Word.Selection currSelection = ThisApplication.Selection;
            if (IsOutlineLevel(ref currSelection))
            {
                Debug.WriteLine("This is a heading too"); // nothing to collect for this heading
            }
            else
            {
                while (!IsOutlineLevel(ref currSelection))
                {
                    string para = currSelection.Range.Text;
                    paras.Add(para);
                    Debug.WriteLine(para);

                    // Check for the end of the document only AFTER storing the paragraph,
                    // otherwise the last body paragraph of the document is lost.
                    if (currSelection.Range.End == documentEnd)
                    {
                        break;
                    }
                    SelectNextParagraph();
                }
            }

            // Return to the heading being processed so that GoToNextHeading below
            // advances by exactly one heading.
            GotoPreviousHeading();
        }

        myDictionary.Add(heading, paras);
        MyList.Add(myDictionary);
        GoToNextHeading();
    }

    // NOTE: List<T>.ToString() is not overridden, so this prints the list's type name, not its contents.
    Debug.WriteLine(MyList.ToString());
}
/// <summary>Issues a Word GoTo for the previous heading relative to the current selection.</summary>
private void GotoPreviousHeading()
{
    Word.Selection selection = ThisApplication.Selection;
    selection.GoTo(What: Word.WdGoToItem.wdGoToHeading, Which: Word.WdGoToDirection.wdGoToPrevious);
}
/// <summary>Issues a Word GoTo for the next heading relative to the current selection.</summary>
private void GoToNextHeading()
{
    Word.Selection selection = ThisApplication.Selection;
    selection.GoTo(What: Word.WdGoToItem.wdGoToHeading, Which: Word.WdGoToDirection.wdGoToNext);
}
/// <summary>Selects the range one paragraph unit after the current selection.</summary>
private void SelectNextParagraph()
{
    Word.Range nextParagraph = ThisApplication.Selection.Next(Unit: Word.WdUnits.wdParagraph, Count: 1);
    nextParagraph.Select();
}
/// <summary>
/// Reports whether the paragraphs of the given selection have an outline level other than body text.
/// </summary>
/// <param name="mySelection">Selection whose paragraphs are inspected.</param>
/// <returns><c>false</c> when the outline level is body text; <c>true</c> otherwise.</returns>
private bool IsOutlineLevel(ref Word.Selection mySelection)
{
    Word.WdOutlineLevel level = mySelection.Range.Paragraphs.OutlineLevel;
    return level != Word.WdOutlineLevel.wdOutlineLevelBodyText;
}
Upvotes: 2
Reputation: 81
It looks like I've solved the problem, and all thanks to @Smith.
But my code looks ugly.
/// <summary>
/// Walks the document heading by heading. For each heading it prints the heading text and
/// the outline level at the selection, then the text of every following paragraph until
/// another heading is selected or no next paragraph exists.
/// </summary>
static void iterateEachHeading() {
    headingAndParagraphs = new Dictionary<Paragraph, Paragraphs>();

    var listOfEachHeading = doc.GetCrossReferenceItems(WdReferenceType.wdRefTypeHeading);
    var headingsCount = listOfEachHeading.Length;

    app.Selection.GoTo(What: WdGoToItem.wdGoToHeading, Which: WdGoToDirection.wdGoToFirst);

    for (int headingIndex = 1; headingIndex <= headingsCount; headingIndex++) {
        Console.WriteLine(listOfEachHeading[headingIndex]);
        Console.WriteLine(app.Selection.Range.Paragraphs.OutlineLevel);

        SelectNextParagraph();

        // Runs zero times when the paragraph right after the heading is itself a heading.
        while (!isHeader(app.Selection)) {
            Console.WriteLine(app.Selection.Range.Text);

            // Stop when there is no following paragraph to move to.
            if (app.Selection.Next(Unit: WdUnits.wdParagraph, Count: 1) == null) {
                break;
            }
            SelectNextParagraph();
        }

        // Step back to the heading just processed, then advance to the next one.
        GotoPreviousHeading();
        GoToNextHeading();
    }
}
/// <summary>Returns the first paragraph of the current selection's range.</summary>
static Paragraph getHeadingAsParagraph() {
    Paragraphs selectedParagraphs = app.Selection.Range.Paragraphs;
    return selectedParagraphs.First;
}
/// <summary>Selects the range one paragraph unit before the current selection.</summary>
static void SelectPreviousParagraph() {
    var previousParagraph = app.Selection.Previous(Unit: WdUnits.wdParagraph, Count: 1);
    previousParagraph.Select();
}
/// <summary>Issues a Word GoTo for the next heading relative to the current selection.</summary>
static void GoToNextHeading() {
    Selection selection = app.Selection;
    selection.GoTo(What: WdGoToItem.wdGoToHeading, Which: WdGoToDirection.wdGoToNext);
}
/// <summary>
/// Reports whether the paragraphs of the given selection have an outline level other than body text.
/// </summary>
/// <param name="mySelection">Selection whose paragraphs are inspected.</param>
/// <returns><c>true</c> unless the outline level is body text.</returns>
static bool isHeader(Selection mySelection) {
    WdOutlineLevel level = mySelection.Range.Paragraphs.OutlineLevel;
    return level != WdOutlineLevel.wdOutlineLevelBodyText;
}
/// <summary>Selects the range one paragraph unit after the current selection.</summary>
static void SelectNextParagraph() {
    var nextParagraph = app.Selection.Next(Unit: WdUnits.wdParagraph, Count: 1);
    nextParagraph.Select();
}
/// <summary>Issues a Word GoTo for the previous heading relative to the current selection.</summary>
static void GotoPreviousHeading() {
    Selection selection = app.Selection;
    selection.GoTo(What: WdGoToItem.wdGoToHeading, Which: WdGoToDirection.wdGoToPrevious);
}
/// <summary>
/// Starts Word, opens the document read-only, prints its headings and paragraphs, and
/// always shuts Word down again.
/// </summary>
/// <param name="args">
/// Optional: <c>args[0]</c> is the path of the document to open. When omitted, the original
/// hard-coded sample path is used.
/// </param>
static void Main(string[] args) {
    string documentPath = args.Length > 0 ? args[0] : "C:/Users/Demoss77/Desktop/test2.docx";

    app = new Microsoft.Office.Interop.Word.Application();
    try {
        doc = app.Documents.Open(documentPath, Visible: true, ReadOnly: true);
        iterateEachHeading();
    }
    finally {
        // Quit even when opening or iterating throws; otherwise the Word process
        // started above is left running.
        app.Quit();
    }
}
The second version of my code, which is cleaner:
/// <summary>
/// For each heading number, jumps to that heading and to the following one, builds the
/// document range between the two positions, and prints the text of every paragraph in it.
/// </summary>
static void iterateEachHeading() {
    var listOfEachHeading = doc.GetCrossReferenceItems(WdReferenceType.wdRefTypeHeading);
    var headingsCount = listOfEachHeading.Length;

    for (int headingNumber = 1; headingNumber <= headingsCount; headingNumber++) {
        // Position of heading number `headingNumber`.
        var sectionStart = app.Selection.GoTo(What: WdGoToItem.wdGoToHeading, Which: WdGoToDirection.wdGoToAbsolute, Count: headingNumber.ToString()).Start;
        // Position reported when jumping to the following heading number.
        var nextHeadingPosition = app.Selection.GoTo(What: WdGoToItem.wdGoToHeading, Which: WdGoToDirection.wdGoToAbsolute, Count: (headingNumber + 1).ToString()).End;

        // When the second jump did not land past the start (presumably the last heading -
        // TODO confirm), fall back to a range given only its start position.
        var section = sectionStart < nextHeadingPosition
            ? doc.Range(sectionStart, nextHeadingPosition)
            : doc.Range(sectionStart);

        foreach (Paragraph paragraph in section.Paragraphs) {
            Console.WriteLine(paragraph.Range.Text);
        }
    }
}
Upvotes: 3
Reputation: 183
I usually prototype an algorithm in VBA and then translate it to C#.
C# is powerful, but when it comes to working with MS Word object references, VBA is more convenient.
Read my code and translate it to your C# code. I think iterating through all paragraphs is not preferable.
(Updated) This algorithm produces a dictionary with at most a two-level (2D) structure.
If you want a nested structure like JSON, you need to parse the document. In that case I think it would be better to parse the .docx XML structure than to use MS Word objects and methods.
Also, I did not consider the case where the first paragraph of the document is not a heading.
' Walks every heading of the active document. For each heading it prints the heading
' text and the outline level at the selection, then the text of each following paragraph
' until the next heading or the end of the document is reached.
' Navigation moves the Selection, so the current selection changes as a side effect.
Sub IteratingHeadingsForDoSomething()
    Dim listHeadingsString As Variant
    Dim headingsCount As Long
    Dim documentEnd As Long
    Dim i As Long

    listHeadingsString = ActiveDocument.GetCrossReferenceItems(wdRefTypeHeading) ' Collection of heading strings, No location info
    headingsCount = UBound(listHeadingsString)
    ' End position of the last paragraph; used to stop before stepping past the document.
    documentEnd = ActiveDocument.Paragraphs.Last.Range.End

    Selection.GoTo What:=wdGoToHeading, Which:=wdGoToFirst 'goto first heading
    For i = 1 To headingsCount Step 1
        Debug.Print listHeadingsString(i) ' put this to the list
        Debug.Print Selection.Range.Paragraphs.OutlineLevel ' use outline level

        ' A heading that is the final paragraph has no following paragraph to select.
        If Selection.Paragraphs(1).Range.End < documentEnd Then
            SelectNextParagraph
            If IsOutlineLevel(Selection) Then
                ' para under the heading is also a heading
                Debug.Print "This is a heading too" ' >>> Skip storing??
            Else
                Do Until IsOutlineLevel(Selection)
                    Debug.Print Selection.Range.Text '>>> Store paragraph separately or concatenatively
                    ' Stop after the document's last paragraph instead of selecting past it.
                    If Selection.Range.End >= documentEnd Then Exit Do
                    SelectNextParagraph
                Loop
            End If
            GotoPreviousHeading ' go back to the heading being processed
        End If

        GoToNextHeading
    Next i
End Sub
'//////////////////////////////////////////
' Issues a Word GoTo for the next heading relative to the current selection.
Sub GoToNextHeading()
    With Selection
        .GoTo What:=wdGoToHeading, Which:=wdGoToNext
    End With
End Sub
'//////////////////////////////////////////
' Issues a Word GoTo for the previous heading relative to the current selection.
Sub GotoPreviousHeading()
    With Selection
        .GoTo What:=wdGoToHeading, Which:=wdGoToPrevious
    End With
End Sub
'//////////////////////////////////////////
' Selects the range one paragraph unit after the current selection.
Sub SelectNextParagraph()
    Dim nextParagraph As Range
    Set nextParagraph = Selection.Next(Unit:=wdParagraph, Count:=1)
    nextParagraph.Select
End Sub
'//////////////////////////////////////////
' Returns False when the paragraphs of mySelection have the body-text outline level,
' True for any other outline level.
Function IsOutlineLevel(mySelection As Selection) As Boolean
    IsOutlineLevel = (mySelection.Range.Paragraphs.OutlineLevel <> wdOutlineLevelBodyText)
End Function
Upvotes: 2