Reputation: 1946
I have an indented file that I need to parsed using java, I need some way to place this in a Section class as shown below
root
root1
text1
text1.1
text1.2
text2
text2.1
text2.2
root2
text1
text1.1
text1.2
text2
text2.1
text2.2.2
I have the class for putting the indented stuff it looks like
public class Section
{
private List<Section> children;
private String text;
private int depth;
public Section(String t)
{
text =t;
}
public List<Section> getChildren()
{
if (children == null)
{
children = new ArrayList<Section>();
}
return children;
}
public void setChildren(List<Section> newChildren)
{
if (newChildren == null) {
children = newChildren;
} else {
if (children == null) {
children = new ArrayList<Section>();
}
for (Section child : newChildren) {
this.addChild(child);
}
}
}
public void addChild(Section child)
{
if (children == null) {
children = new ArrayList<Section>();
}
if (child != null) {
children.add(child);
}
}
public String getText()
{
return text;
}
public void setText(String newText)
{
text =newText;
}
public String getDepth()
{
return depth;
}
public void setDepth(int newDepth)
{
depth = newDepth;
}
}
I need some way to parse the file and place it in expected result which us a Section object which would look like below
Section=
Text="Root"
Children
Child1: Text= "root1"
Child1: "text1"
Child1="Text 1.1"
Child2="Text 1.2"
Child2: "text2"
Child1="Text 2.1"
Child2="Text 2.2"
Children
Child2: Text= "root2"
Child1: "text1"
Child1="Text 1.1"
Child2="Text 1.2"
Child2: "text2"
Child1="Text 2.1"
Child2="Text 2.2"
Here is some code that I have started
int indentCount=0;
while(String text = reader.readline()
{
indentCount=countLeadingSpaces(String word);
//TODO create the section here
}
public static int countLeadingSpaces(String word)
{
int length=word.length();
int count=0;
for(int i=0;i<length;i++)
{
char first = word.charAt(i);
if(Character.isWhitespace(first))
{
count++;
}
else
{
return count;
}
}
return count;
}
Upvotes: 7
Views: 4020
Reputation: 40736
Based on this answer, I've created a C# solution.
It allows for multiple roots and assumes an input structure like:
Test
A
B
C
C1
C2
D
Something
One
Two
Three
An example usage of the code would be:
var lines = new[]
{
"Test",
"\tA",
"\tB",
"\tC",
"\t\tC1",
"\t\tC2",
"\tD",
"Something",
"\tOne",
"\tTwo",
"\tThree"
};
var roots = IndentedTextToTreeParser.Parse(lines, 0, '\t');
var dump = IndentedTextToTreeParser.Dump(roots);
Console.WriteLine(dump);
You can specify the root indention (zero by default) and also the idention character (a tab \t
by default).
The full code:
namespace MyNamespace
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
public static class IndentedTextToTreeParser
{
// https://stackoverflow.com/questions/21735468/parse-indented-text-tree-in-java
public static List<IndentTreeNode> Parse(IEnumerable<string> lines, int rootDepth = 0, char indentChar = '\t')
{
var roots = new List<IndentTreeNode>();
// --
IndentTreeNode prev = null;
foreach (var line in lines)
{
if (string.IsNullOrEmpty(line.Trim(indentChar)))
throw new Exception(@"Empty lines are not allowed.");
var currentDepth = countWhiteSpacesAtBeginningOfLine(line, indentChar);
if (currentDepth == rootDepth)
{
var root = new IndentTreeNode(line, rootDepth);
prev = root;
roots.Add(root);
}
else
{
if (prev == null)
throw new Exception(@"Unexpected indention.");
if (currentDepth > prev.Depth + 1)
throw new Exception(@"Unexpected indention (children were skipped).");
if (currentDepth > prev.Depth)
{
var node = new IndentTreeNode(line.Trim(), currentDepth, prev);
prev.AddChild(node);
prev = node;
}
else if (currentDepth == prev.Depth)
{
var node = new IndentTreeNode(line.Trim(), currentDepth, prev.Parent);
prev.Parent.AddChild(node);
prev = node;
}
else
{
while (currentDepth < prev.Depth) prev = prev.Parent;
// at this point, (currentDepth == prev.Depth) = true
var node = new IndentTreeNode(line.Trim(indentChar), currentDepth, prev.Parent);
prev.Parent.AddChild(node);
}
}
}
// --
return roots;
}
public static string Dump(IEnumerable<IndentTreeNode> roots)
{
var sb = new StringBuilder();
foreach (var root in roots)
{
doDump(root, sb, @"");
}
return sb.ToString();
}
private static int countWhiteSpacesAtBeginningOfLine(string line, char indentChar)
{
var lengthBefore = line.Length;
var lengthAfter = line.TrimStart(indentChar).Length;
return lengthBefore - lengthAfter;
}
private static void doDump(IndentTreeNode treeNode, StringBuilder sb, string indent)
{
sb.AppendLine(indent + treeNode.Text);
foreach (var child in treeNode.Children)
{
doDump(child, sb, indent + @" ");
}
}
}
[DebuggerDisplay(@"{Depth}: {Text} ({Children.Count} children)")]
public class IndentTreeNode
{
public IndentTreeNode(string text, int depth = 0, IndentTreeNode parent = null)
{
Text = text;
Depth = depth;
Parent = parent;
}
public string Text { get; }
public int Depth { get; }
public IndentTreeNode Parent { get; }
public List<IndentTreeNode> Children { get; } = new List<IndentTreeNode>();
public void AddChild(IndentTreeNode child)
{
if (child != null) Children.Add(child);
}
}
}
I've also included an method Dump()
to convert the tree back to a string in order to better debug the algorithm itself.
Upvotes: 4
Reputation: 56
I implemented an alternative solution, using recursive function calls. As far as I can estimate, it will have worse performance than Max Seo's suggestion, especially on deep hierachies. However it is easier to understand (in my opinion), and hence modify to you specific needs. Have a look and let me know if you have any suggestions.
One benefit is that it can handle trees with multiple roots, as it is.
Assuming we have a construct Node, which can contain data and have zero or more children, which are also Nodes. Based on a text input, we want to build trees of nodes, where the data of each node is the content from a line, and the position of the node in tree is indicated by the line position and indentation, so that a line which is indented is the child of the first previous line, which is less indented.
Assuming we have a list of lines, define a function which:
Calling the function with a list of lines, will result in a list of trees, based on their indent. If the tree has only one root, the resulting tree will be the first element of the result list.
List<Node> LinesToTree( List<Line> lines )
{
if(lines.count >= 2)
{
firstLine = lines.shift
nextLine = lines[0]
children = List<Line>
while(nextLine != null && firstLine.indent < nextLine.indent)
{
children.add(lines.shift)
nextLine = lines[0]
}
firstLineNode = new Node
firstLineNode.data = firstLine.data
firstLineNode.children = LinesToTree(children)
resultNodes = new List<Node>
resultNodes.add(firstLineNode)
if(lines.count > 0)
{
siblingNodes = LinesToTree(lines)
resultNodes.addAll(siblingNodes)
return resultNodes
}
else
{
return resultNodes
}
}
elseif()
{
nodes = new List<Node>
node = new Node
node.data = lines[0].data
node.children = new List<Node>
return nodes
}
else
{
return new List<Node>
}
}
The implementation is customisable through the delegate to get the indent, and the name of the children field in output array.
public static function IndentedLinesToTreeArray(array $lineArrays, callable $getIndent = null, $childrenFieldName = "children")
{
//Default function to get element indentation
if($getIndent == null){
$getIndent = function($line){
return $line["indent"];
};
}
$lineCount = count($lineArrays);
if($lineCount >= 2)
{
$firstLine = array_shift($lineArrays);
$children = [];
$nextLine = $lineArrays[0];
while($getIndent($firstLine) < $getIndent($nextLine)){
$children[] = array_shift($lineArrays);
if(!isset($lineArrays[0])){
break;
}
$nextLine = $lineArrays[0];
}
$firstLine[$childrenFieldName] = self::IndentedLinesToTreeArray($children, $getIndent, $childrenFieldName);
if(count($lineArrays)){
return array_merge([$firstLine],self::IndentedLinesToTreeArray($lineArrays, $getIndent, $childrenFieldName));
}else{
return [$firstLine];
}
}
elseif($lineCount == 1)
{
$lineArrays[0][$childrenFieldName] = [];
return $lineArrays;
}
else
{
return [];
}
}
Upvotes: 2
Reputation: 8356
I added a parent pointer as well. Maybe the text can be parsed without it, but parent pointers make it easier. First of all, you need to have more constructors:
static final int root_depth = 4; // assuming 4 whitespaces precede the tree root
public Section(String text, int depth) {
this.text = text;
this.depth = depth;
this.children = new ArrayList<Section>();
this.parent = null;
}
public Section(String text, int depth, Section parent) {
this.text = text;
this.depth = depth;
this.children = new ArrayList<Section>();
this.parent = parent;
}
Then, when you start parsing the file, read it line by line:
Section prev = null;
for (String line; (line = bufferedReader.readLine()) != null; ) {
if (prev == null && line begins with root_depth whitespaces) {
Section root = new Section(text_of_line, root_depth);
prev = root;
}
else {
int t_depth = no. of whitespaces at the beginning of this line;
if (t_depth > prev.getDepth())
// assuming that empty sections are not allowed
Section t_section = new Section(text_of_line, t_depth, prev);
prev.addChild(t_section);
}
else if (t_depth == prev.getDepth) {
Section t_section = new Section(text_of_line, t_depth, prev.getParent());
prev.getParent().addChild(t_section);
}
else {
while (t_depth < prev.getDepth()) {
prev = prev.getParent();
}
// at this point, (t_depth == prev.getDepth()) = true
Section t_section = new Section(text_of_line, t_depth, prev.getParent());
prev.getParent().addChild(t_section);
}
}
}
I have glossed over some finer points of the pseudo-code, but I think you get the overall idea of how to go about this parsing. Do remember to implement the methods addChild(), getDepth(), getParent(), etc.
Upvotes: 4
Reputation: 216
Surprisingly complex problem... but here's a pseudocode
intialize a stack
push first line to stack
while (there are more lines to read) {
S1 = top of stack // do not pop off yet
S2 = read a line
if depth of S1 < depth of S2 {
add S2 as child of S1
push S2 into stack
}
else {
while (depth of S1 >= depth of S2 AND there are at least 2 elements in stack) {
pop stack
S1 = top of stack // do not pop
}
add S2 as child of S1
push S2 into stack
}
}
return bottom element of stack
where depth is the # leading whitespaces. You might have to either modify or wrap the Section class to store the depth of the line.
Upvotes: 6