Reputation: 277
I want to search a large string for all the locations of a string.
Upvotes: 14
Views: 22383
Reputation: 536
Mihran Hovsepyan solution is much faster than std::string::find() when used with strings, but in 2011, there was no std::string_view. Using std::string::find() with std::string_view is muuuch faster, almost as fast as strstr.
Here is a benchmark:
Processing of find_all_indices_knuth_morris_pratt elements took 0.0208348 seconds.
Processing of find_all_indices_strstr elements took 0.0003018 seconds.
Processing of find_all_indices_strfind elements took 0.0704675 seconds.
Processing of find_all_indices_strviewfind elements took 0.0005348 seconds.
2002 2002 2002 2002
Processing of find_all_indices_multi_knuth_morris_pratt elements took 0.978106 seconds.
Processing of find_all_indices_multi_strstr elements took 0.0436946 seconds.
Processing of find_all_indices_multi_strfind elements took 4.37943 seconds.
Processing of find_all_indices_multi_strviewfind elements took 0.0471259 seconds.
have 2002 2002 2002 2002
have 35 35 35 35
public 6006 6006 6006 6006
Code:
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <iostream>
#include <vector>
#include <cstring>
#include <chrono>
#include <unordered_map>
namespace stringmatch {
int strpos(char* haystackx, char* needle)
{
char* p = strstr(haystackx, needle);
if (p)
return p - haystackx;
return -1;
}
std::vector<size_t> find_all_indices_strstr(const std::string& haystack, const std::string& needle)
{
std::vector<size_t> indices;
char* p = ((char*)((size_t)(haystack.c_str())));
char* searchfor = ((char*)((size_t)(needle.c_str())));
int pos = 0;
int offset = 0;
for (;;)
{
pos = strpos(p + offset, searchfor);
if (pos == -1)
break;
offset += pos + 1;
indices.emplace_back(offset - 1);
}
return indices;
}
std::unordered_map<std::string, std::vector<size_t>> find_all_indices_multi_strstr(const std::string& haystack, const std::vector<std::string>& needles)
{
std::unordered_map<std::string, std::vector<size_t>> indicesmap;
indicesmap.reserve(needles.size());
size_t address_of_haystack = (size_t)(haystack.c_str());
char* p = (char*)address_of_haystack;
for (const auto& needle : needles) {
indicesmap[needle].reserve(16);
p = (char*)address_of_haystack;
char* searchfor = ((char*)((size_t)(needle.c_str())));
int pos = 0;
int offset = 0;
for (;;)
{
pos = strpos(p + offset, searchfor);
if (pos == -1)
break;
offset += pos + 1;
indicesmap[needle].emplace_back(offset - 1);
}
}
return indicesmap;
}
void calc_z(size_t mem_address, int len, std::vector<int>& z) {
auto s = (char*)(mem_address);
z.resize(len);
int l = 0, r = 0;
for (int i = 1; i < len; ++i)
if (z[i - l] + i <= r)
z[i] = z[i - l];
else
{
l = i;
if (i > r) r = i;
for (z[i] = r - i; r < len; ++r, ++z[i])
if (s[r] != s[z[i]])
break;
--r;
}
}
std::vector<size_t> find_all_indices_knuth_morris_pratt(const std::string& haystack, const std::string& needle) {
// based on: https://stackoverflow.com/a/5816029/15096247
std::string working_string;
working_string.reserve(haystack.size() * 2);
working_string.append(needle);
working_string.append(haystack);
std::vector<int> z;
std::vector<size_t> results;
size_t mem_address = (size_t)(working_string.c_str());
int len_workingstring = working_string.size();
calc_z(mem_address, len_workingstring, z);
results.reserve(z.size());
for (int i = needle.size(); i < working_string.size(); ++i)
if (z[i] >= needle.size())
results.emplace_back(i - needle.size());
return results;
}
std::unordered_map<std::string, std::vector<size_t>> find_all_indices_multi_knuth_morris_pratt(const std::string& haystack, const std::vector<std::string>& needles) {
std::unordered_map<std::string, std::vector<size_t>> indicesmap;
indicesmap.reserve(needles.size());
std::string working_string;
working_string.reserve(haystack.size() * 2);
std::vector<int> z;
for (auto& needle : needles) {
working_string.append(needle);
working_string.append(haystack);
size_t mem_address = (size_t)(working_string.c_str());
int len_workingstring = working_string.size();
calc_z(mem_address, len_workingstring, z);
indicesmap[needle].reserve(z.size());
for (int i = needle.size(); i < working_string.size(); ++i)
if (z[i] >= needle.size())
indicesmap[needle].emplace_back(i - needle.size());
working_string.clear();
z.clear();
}
return indicesmap;
}
std::vector<size_t> find_all_indices_strfind(const std::string& haystack, const std::string& needle)
{
std::vector<size_t> indices;
size_t offset = 0;
size_t newoffset = 0;
size_t maxlen = haystack.size();
for (;;) {
newoffset = haystack.substr(offset, maxlen).find(needle);
if (newoffset == std::string::npos)
break;
offset += newoffset;
indices.push_back(offset);
maxlen = haystack.size() - offset - needle.size();
offset += needle.size();
}
return indices;
}
std::unordered_map<std::string, std::vector<size_t>> find_all_indices_multi_strfind(const std::string& haystack, const std::vector<std::string>& needles)
{
std::unordered_map<std::string, std::vector<size_t>> indicesmap;
indicesmap.reserve(needles.size());
for (auto& needle : needles) {
indicesmap[needle] = find_all_indices_strfind(haystack, needle);
}
return indicesmap;
}
std::vector<size_t> find_all_indices_strviewfind(const std::string_view haystack, const std::string_view needle)
{
std::vector<size_t> indices;
size_t offset = 0;
size_t newoffset = 0;
size_t maxlen = haystack.size();
for (;;) {
newoffset = haystack.substr(offset, maxlen).find(needle);
if (newoffset == std::string::npos)
break;
offset += newoffset;
indices.push_back(offset);
maxlen = haystack.size() - offset - needle.size();
offset += needle.size();
}
return indices;
}
std::unordered_map<std::string, std::vector<size_t>> find_all_indices_multi_strviewfind(const std::string_view haystack, const std::vector<std::string>& needles)
{
std::unordered_map<std::string, std::vector<size_t>> indicesmap;
indicesmap.reserve(needles.size());
for (auto& needle : needles) {
indicesmap[needle] = find_all_indices_strviewfind(haystack, needle);
}
return indicesmap;
}
}
int main() {
std::chrono::high_resolution_clock::time_point start, end;
std::chrono::duration<double> timespan;
std::string haystack = "These are primarily resources that have public domain texts. Not everything on these sites is in the public domain, so pay attention to dates and copyright notices. I've tried to leave out resources that provide free access, but not necessarily public domain materials. As always, please do your own research before reusing content you find online.These are primarily resources that have public domain texts. Not everything on these sites is in the public domain, so pay attention to dates and copyright notices. I've tried to leave out resources that provide free access, but not necessarily public domain materials. As always, please do your own research before reusing content you find online.";
std::string haystack2 = std::string(haystack);
for (int i = 0; i < 1000; i++) {
haystack += (haystack2);
}
std::string needle = "primarily";
std::vector<std::string> needles{ "have",
"public",
"domain",
"texts",
"Not",
"everything",
"on",
"these",
"sites",
"is",
"in",
"the",
"so",
"pay",
"attention",
"to",
"dates",
"and",
"copyright",
"notices",
"I",
"ve",
"tried",
"leave",
"out",
"resources",
"that",
"provide",
"free",
"access",
"but",
"not",
"necessarily",
"materials",
"As",
"always",
"please",
"do",
"your",
"own",
"research",
"before",
"reusing",
"content",
"you",
"find",
"online",
"These",
"are",
"primarily" };
start = std::chrono::high_resolution_clock::now();
auto result2 = stringmatch::find_all_indices_knuth_morris_pratt(haystack, needle);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_knuth_morris_pratt elements took " << timespan.count() << " seconds." << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result3 = stringmatch::find_all_indices_strstr(haystack, needle);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_strstr elements took " << timespan.count() << " seconds." << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result44 = stringmatch::find_all_indices_strfind(haystack, needle);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_strfind elements took " << timespan.count() << " seconds." << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result55 = stringmatch::find_all_indices_strviewfind(haystack, needle);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_strviewfind elements took " << timespan.count() << " seconds." << std::endl;
std::cout << result2.size() << " " << result3.size() << " " << result44.size() << " " << result55.size() << " " << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result5 = stringmatch::find_all_indices_multi_knuth_morris_pratt(haystack, needles);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_knuth_morris_pratt elements took " << timespan.count() << " seconds." << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result6 = stringmatch::find_all_indices_multi_strstr(haystack, needles);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_strstr elements took " << timespan.count() << " seconds." << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result66 = stringmatch::find_all_indices_multi_strfind(haystack, needles);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_multi_strfind elements took " << timespan.count() << " seconds." << std::endl;
start = std::chrono::high_resolution_clock::now();
auto result77 = stringmatch::find_all_indices_multi_strviewfind(haystack, needles);
end = std::chrono::high_resolution_clock::now();
timespan = duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Processing of find_all_indices_multi_strviewfind elements took " << timespan.count() << " seconds." << std::endl;
for (auto one_needle : needles) {
std::cout << one_needle << " " << result5[one_needle].size() << " " << result6[one_needle].size() << " " << result66[one_needle].size() << " " << result77[one_needle].size() << " " << std::endl;
std::cout << one_needle << " " << result5[one_needle][0] << " " << result6[one_needle][0] << " " << result66[one_needle][0] << " " << result77[one_needle][0] << " " << std::endl;
}
return 0;
}
Upvotes: 0
Reputation: 11108
The two other answers are correct but they are very slow and have O(N^2) complexity. But there is the Knuth-Morris-Pratt algorithm, which finds all substrings in O(N) complexity.
Edit:
Also, there is another algorithm: the so-called "Z-function" with O(N) complexity, but I couldn't find an English source for this algorithm (maybe because there is also another more famous one with same name - the Z-function of Rieman), so I will just put its code here and explain what it does.
void calc_z (string &s, vector<int> & z)
{
int len = s.size();
z.resize (len);
int l = 0, r = 0;
for (int i=1; i<len; ++i)
if (z[i-l]+i <= r)
z[i] = z[i-l];
else
{
l = i;
if (i > r) r = i;
for (z[i] = r-i; r<len; ++r, ++z[i])
if (s[r] != s[z[i]])
break;
--r;
}
}
int main()
{
string main_string = "some string where we want to find substring or sub of string or just sub";
string substring = "sub";
string working_string = substring + main_string;
vector<int> z;
calc_z(working_string, z);
//after this z[i] is maximal length of prefix of working_string
//which is equal to string which starting from i-th position of
//working_string. So the positions where z[i] >= substring.size()
//are positions of substrings.
for(int i = substring.size(); i < working_string.size(); ++i)
if(z[i] >=substring.size())
cout << i - substring.size() << endl; //to get position in main_string
}
Upvotes: 18
Reputation: 38193
Using std::string::find
. You can do something like:
std::string::size_type start_pos = 0;
while( std::string::npos !=
( start_pos = mystring.find( my_sub_string, start_pos ) ) )
{
// do something with start_pos or store it in a container
++start_pos;
}
EDIT: Doh! Thanks for the remark, Nawaz! Better?
Upvotes: 14
Reputation: 33655
I'll add for completeness, there is another approach that is possible with std::search
, works like std::string::find
, difference is that you work with iterators, something like:
std::string::iterator it(str.begin()), end(str.end());
std::string::iterator s_it(search_str.begin()), s_end(search_str.end());
it = std::search(it, end, s_it, s_end);
while(it != end)
{
// do something with this position..
// a tiny optimisation could be to buffer the result of the std::distance - heyho..
it = std::search(std::advance(it, std::distance(s_it, s_end)), end, s_it, s_end);
}
I find that this sometimes outperforms std::string::find
, esp. if you represent your string as a vector<char>
.
Upvotes: 3
Reputation: 55816
Simply use std::string::find()
which returns the position at which the substring was found, or std::string::npos
if none was found.
Here is the documentation.
An here is the example taken from this documentation:
// string::find
#include <iostream>
#include <string>
using namespace std;
int main ()
{
string str ("There are two needles in this haystack with needles.");
string str2 ("needle");
size_t found;
// different member versions of find in the same order as above:
found=str.find(str2);
if (found!=string::npos)
cout << "first 'needle' found at: " << int(found) << endl;
found=str.find("needles are small",found+1,6);
if (found!=string::npos)
cout << "second 'needle' found at: " << int(found) << endl;
found=str.find("haystack");
if (found!=string::npos)
cout << "'haystack' also found at: " << int(found) << endl;
found=str.find('.');
if (found!=string::npos)
cout << "Period found at: " << int(found) << endl;
// let's replace the first needle:
str.replace(str.find(str2),str2.length(),"preposition");
cout << str << endl;
return 0;
}
Upvotes: 2