samm
samm

Reputation: 712

string search across multiple buffers algorithm

I'm developing a NGINX module and need to do a complex string replacement in the response body on the fly without a cumulative buffer (See the below ngx_http_output_body_filter_by_me). Sometime, the buffer in chain cannot hold all response like finding "FGH" in {"ABC", "DEF", "GHI"} illustrated by One Small Caveat of Socket Buffer, so I have to save the match context to continue at the next time.

Is there a ready-made library in C/C++ to seach multiple buffers for a string?

ngx_int_t (*ngx_http_output_body_filter_pt)(ngx_http_request_t *r, ngx_chain_t *chain)
// A callback to a body filter. In modules this is normally used as follows:
static ngx_http_output_body_filter_pt ngx_http_next_body_filter;



// https://tengine.taobao.org/book/chapter_12.html#subrequest-99

typedef struct ngx_http_my_ctx_s {
    const char* pattern_pending; // save the position if partial match
} ngx_http_my_ctx_t;


//https://serverfault.com/questions/480352/modify-data-being-proxied-by-nginx-on-the-fly

/* Intercepts HTTP Response Message Body by our module
 * \param r the request structure pointer containing the details of a request and response
 * \param chain the chained buffers containing the received response this time
 */
ngx_int_t ngx_http_output_body_filter_by_me(ngx_http_request_t *r, ngx_chain_t *chain) {
    // TODO Auto-generated method stub
    //logdf("%.*s", ARGS_NGX_STR(req->unparsed_uri));
    const char* pattern = "substring";
    size_t pattern_length = strlen(pattern);
    const char* pattern_pending;
    for (ngx_chain_t *cl = chain; cl; cl = cl->next) {
        ngx_buf_t *buf = cl->buf;
        // logdf("%.*s", (int)(buf->last - buf->pos), buf->pos);
        for (u_char* pch = buf->pos; pch <= buf->last; ++pch) {
            // ctx->pattern_pending = pattern + pos;
        }
    }
}

References

NGINX References

Upvotes: 2

Views: 183

Answers (1)

samm
samm

Reputation: 712

I used a simple implementation (online).

/** match granularity is bytes, i.e. compare byte by byte.
 * @param mem the text to be searched
 * @param mem_len the text length
 * @param pattern the word sought
 * @param pattern_length the pattern length
 * @param pattern_index the continuing index from the last partial/whole match or 0
 * @return the past-the-end index after the text is looked through or the stop index if partial match occurs
 * @example size_t mem_idx, index = 0; // if matched, current matching range in mem is [mem_idx-(index-old_index), mem_idx-1].
 *  mem_idx = memmatch("ABC", 3, "FGH", 3, &index); // NotFound: mem_idx = 3; index = 0;
 *  mem_idx = memmatch("EFG", 3, "FGH", 3, &index); // Continue: mem_idx = 3; index = 2; mem[1,2]=pat[0,1]
 *  mem_idx = memmatch("HIJ", 3, "FGH", 3, &index); // Complete: mem_idx = 1; index = 3; mem[0,0]=pat[2,2]
 */
size_t memmatch(const void* mem, size_t mem_len, const void* pattern, size_t pattern_length, size_t* pattern_index) {
    assert(*pattern_index < pattern_length);
    size_t idx = 0; // do for loop on `mem`
    register size_t index = *pattern_index;
    for (; idx < mem_len;) {
        if (*((const char*)mem + idx++) == *(const char*)pattern + index) {
            ++index;
            if (pattern_length == index) {
                break; // ++idx;
            }
        } else if (index) {
            index = 0; // reset
        }
    }
    *pattern_index = index;
    return idx;
}

#ifdef MEMMATCH_EXAMPLE
void memmatch_example0() {
    size_t mem_idx, idx, index = 0;
    mem_idx = memmatch("ABC", 3, "FGH", 3, (idx = index, &index)); // NotFound
    std::cout << "mem_idx stops at " << mem_idx << ", pat_idx stops at " << index << " since " << idx << '\n';
    mem_idx = memmatch("EFG", 3, "FGH", 3, (idx = index, &index)); // Continue
    std::cout << "mem_idx stops at " << mem_idx << ", pat_idx stops at " << index << " since " << idx << '\n';
    mem_idx = memmatch("HIJ", 3, "FGH", 3, (idx = index, &index)); // Complete
    std::cout << "mem_idx stops at " << mem_idx << ", pat_idx stops at " << index << " since " << idx << '\n';
}
#endif

Upvotes: 0

Related Questions