Reputation: 377
I want to split a string into tokens.
I ripped off another Stack Overflow question - Equivalent to StringTokenizer with multiple characters delimiters, but I want to know if this can be done with only String methods (.equals(), .startsWith(), etc.). I don't want to use regexes, the StringTokenizer class, Patterns, Matchers or anything other than String,
for that matter.
For example, this is how I want to call the method
String[] delimiters = {" ", "==", "=", "+", "+=", "++", "-", "-=", "--", "/", "/=", "*", "*=", "(", ")", ";", "/**", "*/", "\t", "\n"};
String splitString[] = tokenizer(contents, delimiters);
And this is the code I ripped off from the other question (this is the approach I don't want to use).
/**
 * Splits {@code string} at every occurrence of one of the literal
 * {@code delimiters} and returns the pieces in order, delimiters included.
 * Empty pieces (between adjacent delimiters, or at either end) are omitted.
 *
 * @param string     the text to tokenize
 * @param delimiters literal delimiters; empty strings are ignored and the
 *                   array itself is left untouched
 * @return the text fragments and the delimiters found, in source order
 */
private String[] tokenizer(String string, String[] delimiters) {
    // Sort a copy so the caller's array is not reordered as a side effect.
    String[] ordered = delimiters.clone();
    // Regex alternation takes the first alternative that matches, so a
    // delimiter must come before any of its own prefixes ("&&" before "&").
    // Reverse lexicographic order guarantees exactly that.
    Arrays.sort(ordered, new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            return o2.compareTo(o1);
        }
    });

    // Build the union of all delimiters. Pattern.quote makes each one a
    // literal; prefixing every character with a backslash would instead turn
    // letters and digits into regex constructs ("t" -> tab, "d" -> digit).
    StringBuilder regexpr = new StringBuilder();
    for (String delim : ordered) {
        if (delim.length() == 0) {
            continue; // an empty alternative would match at every position
        }
        if (regexpr.length() > 0) {
            regexpr.append('|');
        }
        regexpr.append(Pattern.quote(delim));
    }

    List<String> res = new ArrayList<String>();
    if (regexpr.length() == 0) {
        // Nothing to split on: the whole (non-empty) text is a single token.
        if (string.length() > 0) {
            res.add(string);
        }
        return res.toArray(new String[res.size()]);
    }

    Matcher m = Pattern.compile(regexpr.toString()).matcher(string);
    int pos = 0; // end of the previous delimiter
    while (m.find()) {
        if (pos != m.start()) {
            // Text between the previous delimiter and this one.
            res.add(string.substring(pos, m.start()));
        }
        res.add(m.group()); // the delimiter itself
        pos = m.end();
    }
    if (pos != string.length()) {
        // Text after the last delimiter.
        res.add(string.substring(pos));
    }
    return res.toArray(new String[res.size()]);
}
/**
 * Returns a copy of {@code v} without the elements that are exactly one
 * space character; every other element is kept in its original order.
 *
 * @param v the tokens to filter
 * @return a new array holding the remaining tokens
 */
public static String[] clean(final String[] v) {
    final List<String> kept = new ArrayList<String>(v.length);
    for (final String item : v) {
        if (!" ".equals(item)) {
            kept.add(item);
        }
    }
    return kept.toArray(new String[kept.size()]);
}
Edit: I ONLY want to use string methods charAt, equals, equalsIgnoreCase, indexOf, length, and substring
Upvotes: 13
Views: 2501
Reputation: 11881
You can use recursion (a hallmark of functional programming) to make it less verbose.
/**
 * Recursively splits {@code text} at the delimiter occurrence that starts
 * earliest in the text and returns {@code [head, delimiter, tail...]}.
 * When several delimiters start at the same position the longest one wins,
 * so "==" is not reported as two "=". Empty delimiters are ignored.
 * As before, a delimiter at the very start or end of the text yields an
 * empty-string neighbour in the result.
 *
 * @param text   the text to tokenize
 * @param delims literal delimiters, in any order
 * @return text fragments and delimiters in source order; {@code [text]} if
 *         no delimiter occurs
 */
public static String[] tokenizer(String text, String[] delims) {
    // Find the earliest delimiter in the text. Taking the first delimiter in
    // array order instead would leave the head un-tokenized.
    int cut = -1;
    String hit = null;
    for (String delim : delims) {
        if (delim.length() == 0) {
            continue; // indexOf("") is always 0 and would recurse forever
        }
        int i = text.indexOf(delim);
        if (i < 0) {
            continue;
        }
        boolean earlier = hit == null || i < cut;
        boolean longerAtSameSpot = hit != null && i == cut && delim.length() > hit.length();
        if (earlier || longerAtSameSpot) {
            cut = i;
            hit = delim;
        }
    }
    if (hit == null) {
        return new String[] { text };
    }
    // recursive call on everything after the delimiter
    String[] tail = tokenizer(text.substring(cut + hit.length()), delims);
    // return [ head, middle, tail.. ]
    String[] list = new String[tail.length + 2];
    list[0] = text.substring(0, cut);
    list[1] = hit;
    System.arraycopy(tail, 0, list, 2, tail.length);
    return list;
}
Tested it using the same unit-test from the other answer
/** Demo: tokenizes the alphabet around two multi-character needles and prints one token per line. */
public static void main(String ... params) {
    String alphabet = "abcdefghijklmnopqrstuvwxyz";
    String[] separators = { "def", "tuv" };
    String[] pieces = tokenizer(alphabet, separators);
    for (int i = 0; i < pieces.length; i++) {
        System.out.println(pieces[i]);
    }
}
Output
abc
def
ghijklmnopqrs
tuv
wxyz
It would be a little more elegant if Java had better native array support.
Upvotes: 1
Reputation: 2255
As simple as I could get it...
/**
 * Splits text on a set of literal, possibly multi-character delimiters using
 * only plain {@code String} operations (no regular expressions).
 */
public class StringTokenizer {

    /**
     * Splits {@code s} at every occurrence of one of {@code tokens} and returns
     * the text fragments and the delimiters in source order. At each position
     * the longest matching delimiter wins. Empty fragments are never emitted,
     * neither before a leading delimiter nor after a trailing one.
     *
     * @param s      the text to split
     * @param tokens literal delimiters; empty strings are ignored and the
     *               array is not modified
     * @return fragments and delimiters in the order they appear in {@code s}
     */
    public static String[] split(String s, String[] tokens) {
        // Sort a copy, longest first, so the caller's array keeps its order.
        String[] byLength = tokens.clone();
        Arrays.sort(byLength, new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                return Integer.compare(o2.length(), o1.length());
            }
        });

        LinkedList<String> result = new LinkedList<>();
        int j = 0; // start of the pending (not yet emitted) text fragment
        int i = 0; // current scan position
        while (i < s.length()) {
            String matched = null;
            for (String token : byLength) {
                // An empty token matches everywhere without consuming input,
                // which would loop forever, so skip it.
                if (token.length() > 0 && s.startsWith(token, i)) {
                    matched = token;
                    break;
                }
            }
            if (matched == null) {
                i++;
                continue;
            }
            if (i > j) {
                result.add(s.substring(j, i));
            }
            result.add(matched);
            i += matched.length();
            j = i;
        }
        // Only emit a trailing fragment if there is one.
        if (j < s.length()) {
            result.add(s.substring(j));
        }
        return result.toArray(new String[result.size()]);
    }
}
It creates a lot of new objects, and could be optimized by writing a custom startsWith()
implementation that compares the string char by char.
/** Checks that multi-character operators are preferred over their single-character prefixes. */
@Test
public void test() {
    String input = "this==is the most>complext<=string<<ever";
    String[] operators = {"=", "<", ">", "==", ">=", "<="};
    String[] expected = {"this", "==", "is the most", ">", "complext", "<=", "string", "<", "<", "ever"};
    String[] split = StringTokenizer.split(input, operators);
    assertArrayEquals(expected, split);
}
passes fine :)
Upvotes: 1
Reputation: 2063
Honestly, you could use Apache Commons Lang. If you check the source code of the library you will notice that it doesn't use regex. Only Strings and a lot of flags are used in the method [StringUtils.split](http://commons.apache.org/proper/commons-lang/javadocs/api-2.6/org/apache/commons/lang/StringUtils.html#split(java.lang.String, java.lang.String)).
Anyway, take a look at this code using Apache Commons Lang.
import org.apache.commons.lang.StringUtils;
import org.junit.Assert;
import org.junit.Test;
/**
 * Demonstrates regex-free splitting: every delimiter is first replaced by a
 * single marker string, then the text is split on that marker with
 * Apache Commons Lang {@code StringUtils.split}.
 */
public class SimpleTest {
    @Test
    public void testSplitWithoutRegex() {
        // Longer delimiters are listed first so that e.g. "==" is replaced
        // before "=" gets a chance to match inside it.
        String[] delimiters = {"==", "+=", "++", "-=", "--", "/=", "*=", "/**", "*/",
                " ", "=", "+", "-", "/", "*", "(", ")", ";", "\t", "\n"};
        String finalDelimiter = "#";
        String s = "Assuming that we have /** or /* all these signals like == and; / or * will be replaced.";

        // Check that the marker can be used: it must not occur inside any
        // delimiter, and it must not already occur in the input, otherwise
        // the final split could not tell markers from original text.
        boolean canBeUsed = s.indexOf(finalDelimiter) == -1;
        for (String delimiter : delimiters) {
            if (delimiter.indexOf(finalDelimiter) != -1) {
                canBeUsed = false;
                break;
            }
        }
        if (!canBeUsed) {
            Assert.fail("The selected delimiter can't be used.");
        }

        System.out.println(s);
        for (String delimiter : delimiters) {
            // String.replace already replaces every occurrence; no loop needed.
            s = s.replace(delimiter, finalDelimiter);
        }
        // Split on the same marker that was inserted above.
        String[] splitted = StringUtils.split(s, finalDelimiter);
        for (String s1 : splitted) {
            System.out.println(s1);
        }
    }
}
I hope it helps.
Upvotes: 1
Reputation: 17493
Maybe I haven't fully understood the question, but I have the impression that you want to rewrite the Java String method split(). I would advise you to have a look at this function, see how it's done and start from there.
Upvotes: 1
Reputation: 9559
EDIT: My original answer did not quite do the trick, it did not include the delimiters in the resultant array, and used the String.split() method, which was not allowed.
Here's my new solution, which is split into 2 methods:
/**
 * Splits the string at all specified literal delimiters, and includes the delimiters in the resulting array.
 * Delimiters are applied longest first, so a long delimiter is split out before any
 * shorter delimiter contained in it. The caller's {@code delimiters} array is not modified.
 *
 * @param subject    the text to tokenize
 * @param delimiters the literal delimiters to split at
 * @return the fragments and delimiters in source order
 */
private static String[] tokenizer(String subject, String[] delimiters) {
    //Sort a copy of the delimiters into length order, starting with longest,
    //so the caller's array keeps its original order
    String[] byLength = delimiters.clone();
    Arrays.sort(byLength, new Comparator<String>() {
        @Override
        public int compare(String s1, String s2) {
            return Integer.compare(s2.length(), s1.length());
        }
    });
    //start with a list with only one string - the whole thing
    List<String> tokens = new ArrayList<String>();
    tokens.add(subject);
    //loop through the delimiters, splitting on each one
    for (int i = 0; i < byLength.length; i++) {
        tokens = splitStrings(tokens, byLength, i);
    }
    return tokens.toArray(new String[0]);
}
/**
 * Splits each String in the subject at the delimiter selected by {@code delimiterIndex}.
 * Parts that are themselves equal to one of the delimiters are passed through unchanged,
 * so a delimiter found earlier is never split again by a shorter one.
 * Matches are taken left to right without overlap. Empty fragments between, before or
 * after delimiters are kept. An empty delimiter splits nothing.
 *
 * @param subject        the parts produced so far
 * @param delimiters     all delimiters (used to recognise already-split parts)
 * @param delimiterIndex index of the delimiter to split at in this pass
 * @return a new list with every part split at the selected delimiter
 */
private static List<String> splitStrings(List<String> subject, String[] delimiters, int delimiterIndex) {
    String delimiter = delimiters[delimiterIndex];
    if (delimiter.length() == 0) {
        // indexOf("") matches everywhere and never advances; nothing to split on
        return new ArrayList<String>(subject);
    }
    List<String> result = new ArrayList<String>();
    //for each input string
    for (String part : subject) {
        //if this part equals one of the delimiters, don't split it up any more
        if (isDelimiter(part, delimiters)) {
            result.add(part);
            continue;
        }
        int start = 0;
        // Jump straight to the next occurrence at or after the end of the previous
        // match; this also prevents overlapping matches such as "==" inside "===".
        int hit = part.indexOf(delimiter, start);
        while (hit >= 0) {
            result.add(part.substring(start, hit)); // part before delimiter
            result.add(delimiter); // delimiter
            start = hit + delimiter.length(); // next part starts after delimiter
            hit = part.indexOf(delimiter, start);
        }
        result.add(part.substring(start)); // rest of string after last delimiter
    }
    return result;
}

/** Tells whether {@code part} is exactly one of the delimiters. */
private static boolean isDelimiter(String part, String[] delimiters) {
    for (String testDelimiter : delimiters) {
        if (testDelimiter.equals(part)) {
            return true;
        }
    }
    return false;
}
Original Answer
I notice you are using Pattern
when you said you only wanted to use String methods.
The approach I would take would be to think of the simplest way possible. I think that is to first replace all the possible delimiters with just one delimiter, and then do the split.
Here's the code:
/**
 * Splits {@code string} at every occurrence of the given literal delimiters;
 * the delimiters themselves are not part of the result. Delimiters are
 * applied in array order, each one being replaced by a single marker
 * character before the final split.
 *
 * @param string     the text to split
 * @param delimiters literal delimiters; empty strings are ignored
 * @return the fragments between delimiters, with {@code String.split}
 *         semantics (trailing empty fragments are dropped)
 * @throws IllegalArgumentException if no unused marker character can be found
 */
private String[] tokenizer(String string, String[] delimiters) {
    // A fixed marker such as "{split}" breaks as soon as a delimiter occurs
    // inside it ("{" makes the replacement loop run forever), so pick one
    // character that appears neither in the text nor in any delimiter.
    String marker = String.valueOf(pickMarker(string, delimiters));
    //replace all specified delimiters with one
    for (String delimiter : delimiters) {
        if (delimiter.length() == 0) {
            continue; // replacing "" would insert the marker between all characters
        }
        // replace() takes literal text and already replaces every occurrence
        string = string.replace(delimiter, marker);
    }
    //now split at the new delimiter
    return string.split(marker);
}

/** Finds a private-use character that occurs in neither the text nor any delimiter. */
private static char pickMarker(String string, String[] delimiters) {
    for (char candidate = '\uE000'; candidate <= '\uF8FF'; candidate++) {
        if (string.indexOf(candidate) >= 0) {
            continue;
        }
        boolean clash = false;
        for (String delimiter : delimiters) {
            if (delimiter.indexOf(candidate) >= 0) {
                clash = true;
                break;
            }
        }
        if (!clash) {
            return candidate;
        }
    }
    throw new IllegalArgumentException("No unused marker character available for splitting");
}
I need to use String.replace()
and not String.replaceAll()
because replace()
takes literal text and replaceAll()
takes a regex argument, and the delimiters supplied are of literal text.
That's why I also need a while loop to replace all instances of each delimiter.
Upvotes: 9
Reputation: 1290
Suggestion:
/** Sentinel meaning "no delimiter found yet" while searching for the nearest one. */
private static final int INIT_INDEX_MAX_INT = Integer.MAX_VALUE;

/**
 * Splits {@code string} at the given literal delimiters and returns the
 * fragments and the delimiters in source order, so that joining the result
 * reproduces the input. The nearest delimiter is taken first; when several
 * start at the same position the longest wins ("+=" over "+"). Empty
 * fragments are never emitted and empty delimiters are ignored.
 *
 * @param string     the text to tokenize
 * @param delimiters literal delimiters, in any order
 * @return fragments and delimiters in the order they occur
 */
private static String[] tokenizer(final String string, final String[] delimiters) {
    final List<String> result = new ArrayList<>();
    int currentPosition = 0;
    while (currentPosition < string.length()) {
        // plan: search for the nearest delimiter and its position
        String nextDelimiter = "";
        int positionIndex = INIT_INDEX_MAX_INT;
        for (final String currentDelimiter : delimiters) {
            if (currentDelimiter.length() == 0) {
                continue; // would match at currentPosition forever without advancing
            }
            final int currentPositionIndex = string.indexOf(currentDelimiter, currentPosition);
            if (currentPositionIndex < 0) { // current delimiter not found, go to the next
                continue;
            }
            final boolean nearer = currentPositionIndex < positionIndex;
            final boolean longerAtSameIndex = currentPositionIndex == positionIndex
                    && currentDelimiter.length() > nextDelimiter.length();
            if (nearer || longerAtSameIndex) { // we found a better one, update
                positionIndex = currentPositionIndex;
                nextDelimiter = currentDelimiter;
            }
        }
        if (positionIndex == INIT_INDEX_MAX_INT) { // we found nothing, finish up
            result.add(string.substring(currentPosition));
            break;
        }
        // we have one, add substring (if any) + delimiter to result and update current position
        if (positionIndex > currentPosition) {
            result.add(string.substring(currentPosition, positionIndex));
        }
        result.add(nextDelimiter);
        currentPosition = positionIndex + nextDelimiter.length();
    }
    return result.toArray(new String[0]);
}
Notes:
I ONLY want to use string methods charAt, equals, equalsIgnoreCase, indexOf, length, and substring
Check. The function uses only indexOf(), length() and substring().
No, I mean in the returned results. For example, if my delimiter was `{`, and a string was `ge{ab`, I would like an array with `ge`, `{` and `ab`.
Check:
/** Tokenizes a small sample and prints the tokens, then whether re-joining them restores the input. */
private static void test() {
    final String contents = "ge{ab";
    final String[] delimiters = new String[] { "{" };
    final String[] splitString = tokenizer(contents, delimiters);
    final String joined = String.join("", splitString);
    System.out.println(Arrays.toString(splitString));
    final String verdict;
    if (contents.equals(joined)) {
        verdict = "ok";
    } else {
        verdict = "wrong: [" + contents + "]#[" + joined + "]";
    }
    System.out.println(verdict);
}
// [ge, {, ab]
// ok
One final remark: I would advise reading about compiler construction, in particular the compiler front end (lexical analysis), if you want best practices for this kind of problem.
Upvotes: 1
Reputation: 1661
Using only non-regex String methods... I used the startsWith(...) method, which wasn't in the exclusive list of methods that you listed because it does simply string comparison rather than a regex comparison.
The following impl:
/** Demo: splits the alphabet around two needles and prints each resulting token on its own line. */
public static void main(String ... params) {
    String alphabet = "abcdefghijklmnopqrstuvwxyz";
    String[] separators = { "def", "tuv" };
    String[] pieces = splitIntoTokensUsingNeedlesFoundInHaystack(alphabet, separators);
    for (int i = 0; i < pieces.length; i++) {
        System.out.println(pieces[i]);
    }
}
/**
 * Splits {@code haystack} at every occurrence of one of the {@code needles}
 * and returns the text fragments and the needles in source order.
 * At each position the longest matching needle wins, regardless of the order
 * in which the needles are listed. Empty needles are ignored and empty
 * fragments are never emitted.
 *
 * @param haystack the text to split
 * @param needles  literal delimiters
 * @return fragments and needles in the order they occur
 */
private static String[] splitIntoTokensUsingNeedlesFoundInHaystack(String haystack, String[] needles) {
    List<String> list = new LinkedList<String>();
    int pendingStart = 0;  // start of the text not yet emitted
    int haystackIndex = 0; // current scan position
    while (haystackIndex < haystack.length()) {
        String found = longestNeedleAt(haystack, haystackIndex, needles);
        if (found == null) {
            haystackIndex++;
            continue;
        }
        if (haystackIndex > pendingStart) {
            list.add(haystack.substring(pendingStart, haystackIndex));
        }
        list.add(found);
        haystackIndex += found.length();
        pendingStart = haystackIndex;
    }
    if (pendingStart < haystack.length()) {
        list.add(haystack.substring(pendingStart));
    }
    return list.toArray(new String[0]);
}

/** Returns the longest non-empty needle starting at {@code index}, or {@code null} if none does. */
private static String longestNeedleAt(String haystack, int index, String[] needles) {
    String best = null;
    for (String needle : needles) {
        if (needle.length() == 0) {
            continue; // matches everywhere without consuming input
        }
        if (haystack.startsWith(needle, index) && (best == null || needle.length() > best.length())) {
            best = needle;
        }
    }
    return best;
}
outputs
abc
def
ghijklmnopqrs
tuv
wxyz
Note... This code is demo-only. In the event that one of the delimiters is an empty String, it will behave poorly and eventually crash with OutOfMemoryError: Java heap space after consuming a lot of CPU.
Upvotes: 3
Reputation: 357
As far as I understand your problem, you can do something like this -
/**
 * Splits {@code value} at every occurrence of one of the literal
 * {@code delimeters} and returns the text fragments and the delimiters in
 * the order they occur. The input is scanned once from left to right; at
 * each position the longest matching delimiter wins. Input without any
 * delimiter is returned as a single token. Empty fragments are not emitted
 * and empty delimiters are ignored.
 *
 * @param value      the text to tokenize
 * @param delimeters literal delimiters, in any order
 * @return the tokens (all {@code String}s) in source order
 */
public Object[] tokenizer(String value, String[] delimeters){
    List<String> list = new ArrayList<String>();
    int textStart = 0; // start of the fragment not yet emitted
    int pos = 0;       // current scan position
    while (pos < value.length()) {
        // Longest delimiter starting at pos, compared literally (no regex).
        String match = null;
        for (String s : delimeters) {
            if (s.length() > 0 && value.startsWith(s, pos)
                    && (match == null || s.length() > match.length())) {
                match = s;
            }
        }
        if (match == null) {
            pos++;
            continue;
        }
        if (pos > textStart) {
            list.add(value.substring(textStart, pos));
        }
        list.add(match); // every occurrence is reported, not just the first
        pos += match.length();
        textStart = pos;
    }
    if (textStart < value.length()) {
        list.add(value.substring(textStart));
    }
    Object[] newValues = list.toArray();
    return newValues;
}
Now in the main method call this function -
// Delimiters to split on; "{" is included for the sample input below.
String[] delimeters = new String[] {
    " ", "{", "==", "=", "+", "+=", "++", "-", "-=", "--", "/", "/=", "*", "*=",
    "(", ")", ";", "/**", "*/", "\t", "\n"
};
// st is the reference of the other class. Edit this of your own.
Object[] obj = st.tokenizer("ge{ab", delimeters);
// Print one token per line.
for (int i = 0; i < obj.length; i++) {
    System.out.println(obj[i].toString());
}
Upvotes: 1