How to inject javascript in existing HTML response with node.js and cloudflare workers

Question

I have a vanity URL pointing to a GitBook. GitBook doesn't support the insertion of arbitrary javascript snippets. At the moment GitBook has 4 "integrations" only.

I could route through my own VM server to accomplish this, but I have CloudFlare and I want to try out workers. (Javascript running at the CDN edge).

The CloudFlare worker environment makes header injection very easy, but there is no obvious way to inject a JavaScript snippet into the HTML body of the response.

Kind Contributor · Accepted Answer

It's important to process with a TransformStream so that processing is async and doesn't require memory buffering (for scalability and to minimise GC) - there's only a 5ms CPU time budget.

Overview:

To use for yourself, change the strings forHeadStart, forHeadEnd, and forBodyEnd.
This deferredInjection approach is the recommended way that minimises CPU time for the worker. It's more efficient because it only needs to parse the very start of the HTML. The other approach requires parsing of the whole head section for headInjection, and if you use bodyInjection it practically needs to parse the whole html response.
The deferredInjection approach works by injecting the content into the start of the head tag, then on the client-side at runtime your HTML content will be deployed to the desired places.
You can inject directly if needed using headInjection and/or bodyInjection. To do so, uncomment the related code (including the calls in injectScripts) and set the strings that will be encoded for tagBytes.
This solution will only parse HTML content types
This solution works directly on bytes (not strings) for better efficiency. Searching for the bytes of the end-tag strings.
You could potentially target more end-tags, but usually you don't need to target more than these two
Processes data with streaming (the whole HTML string is not cached in memory). This lowers peak memory usage and speeds up time to first byte.
Handles a rare edge case where the closing tag is on a text read boundary. I believe a boundary might occur every ~1000 bytes (TCP packets 1000-1500 bytes each), and this can vary due to gzip compression.
Keeps the injection-parsing code separate from the code that simply forwards the rest of the response, for clarity.
You can disable the second body-tag injector by commenting it out if you don't need it - that will speed up processing.
I have tested this exact code for myself and it works. There might be remaining bugs (depending on location of closing tag, and depending if your server replies with partial html templates (body only)). I may have fixed one today 2019-06-28

Code

// Register the worker's fetch handler: every request is answered by handleRequest.
addEventListener('fetch', (fetchEvent) => {
  // If the worker throws, let the request continue as if the worker was not there.
  fetchEvent.passThroughOnException();

  const pendingResponse = handleRequest(fetchEvent.request);
  fetchEvent.respondWith(pendingResponse);
});

/**
 * Fetch the requested resource and, when it is an HTML document, stream it
 * back through injectScripts so the configured snippets get injected.
 *
 * Responses that are not HTML, that carry no Content-Type header, or that
 * have no body are returned untouched.
 *
 * @param {Request} request - the incoming request, forwarded as-is to fetch
 * @returns {Promise<Response>} the original response, or a new response whose
 *   body is the readable side of the injecting TransformStream
 */
async function handleRequest(request) {
  const response = await fetch(request);

  // headers.get() returns null when the header is absent, so fall back to an
  // empty string. Media types are case-insensitive, hence the lower-casing.
  const contentType = (response.headers.get('content-type') || '').toLowerCase();
  if (!contentType.startsWith('text/html')) {
    return response; //Only parse html body
  }

  // A response without a body has no stream to read from and nothing to
  // inject into.
  if (response.body == null) {
    return response;
  }

  const { readable, writable } = new TransformStream();

  // Deliberately not awaited: the pump has to run while the client consumes
  // `readable`; awaiting here would stall before the response is returned.
  injectScripts(response.body, writable);

  return new Response(readable, response);
}

// Shared encoder used only while the injection jobs below are being built.
// TextEncoder always emits UTF-8 and its constructor takes no arguments.
let encoder = new TextEncoder();

/**
 * Injection job executed by injectScripts: the encoded helper markup is
 * written right after the opening head tag.
 *
 *   forInjection   - bytes to inject
 *   tagBytes       - bytes of the tag searched for in the stream (case sensitive)
 *   insertAfterTag - whether the injection belongs after (true) or before the tag
 */
let deferredInjection = (() => {
    // Change these strings to the markup you want to deploy.
    const forHeadStart = ``;
    // NOTE(review): forHeadEnd and forBodyEnd are not referenced by the helper
    // template below; the client-side code that should place them at runtime
    // appears to be missing from this copy of the script - TODO restore it.
    const forHeadEnd = ``;
    const forBodyEnd = ``;

    const helper = `
    ${forHeadStart}
    
    `;

    return {
        forInjection: encoder.encode(helper),
        // An empty pattern can never be matched by the search functions, which
        // silently disables the injection. The content is meant to go at the
        // start of the head section, so search for the opening head tag.
        tagBytes: encoder.encode("<head>"), //case sensitive
        insertAfterTag: true
    };
})();

// Optional direct injections. Uncomment together with the matching calls in
// injectScripts, and fill in forInjection.
// let headInjection = {
    // forInjection: encoder.encode(""),
    // tagBytes: encoder.encode("</head>"), //case sensitive
    // insertAfterTag: false
// };
// let bodyInjection = {
    // forInjection: encoder.encode(""),
    // tagBytes: encoder.encode("</body>"), //case sensitive
    // insertAfterTag: false
// }

// All snippets are encoded by now, so drop the reference to the encoder.
encoder = null;

/**
 * Pump `readable` into `writable`, running the configured injection job(s)
 * on the way and forwarding whatever is left afterwards.
 *
 * @param {ReadableStream} readable - body of the upstream response
 * @param {WritableStream} writable - writable side handed to the client response
 * @returns {Promise<void>}
 */
async function injectScripts(readable, writable) {
  // Acquire the reader first, then the writer.
  const reader = readable.getReader();
  const writer = writable.getWriter();

  // One state object is shared by every parsing pass of this request.
  const processingState = {
    readStream: readable,
    writeStream: writable,
    reader,
    writer,
    // Data left over after a tag is found; picked up by the next pass.
    leftOvers: null,
    inputDone: false,
    // Reused search-result object for the duration of the request.
    result: { charactersFound: 0, foundIndex: -1, afterHeadTag: -1 },
  };

  await parseForInjection(processingState, deferredInjection);

  // Optional direct injections - enable together with their job definitions.
  //await parseForInjection(processingState, headInjection);
  //await parseForInjection(processingState, bodyInjection);

  await forwardTheRest(processingState);
}



/**
 * Look for `tagBytes` inside `chunk` and report the outcome through `result`.
 * The caller supplies `result` so one object can be reused across calls
 * (less garbage to collect).
 *
 * Outcomes written to `result`:
 *  - no match and no partial match at the end of the chunk:
 *      charactersFound 0, foundIndex -1, afterHeadTag -1
 *  - exact match: charactersFound === tagBytes.length, foundIndex is the
 *      index of the first matching byte, afterHeadTag the index just past it
 *  - partial match cut off by the end of the chunk:
 *      0 < charactersFound < tagBytes.length
 *
 * @param {Uint8Array} chunk - bytes to search
 * @param {Uint8Array} tagBytes - byte pattern to find
 * @param {{charactersFound: number, foundIndex: number, afterHeadTag: number}} result
 *   object overwritten with the outcome
 * @returns {undefined}
 */
function searchByteArrayChunkForClosingTag(chunk, tagBytes, result) {
  const tagLength = tagBytes.length;
  let resumeFrom = 0;

  while (true) {
    // Every attempt starts from a clean slate.
    result.charactersFound = 0;
    result.foundIndex = -1;
    result.afterHeadTag = -1;

    const candidate = chunk.indexOf(tagBytes[0], resumeFrom);
    if (candidate === -1) {
      return; // The first byte does not occur any more: definitely not found
    }

    // The first byte matches; a failed attempt resumes right behind it.
    let cursor = candidate + 1;
    resumeFrom = cursor;
    result.foundIndex = candidate;
    result.charactersFound = 1;
    result.afterHeadTag = cursor;

    for (let tagPos = 1; tagPos < tagLength; tagPos += 1) {
      if (cursor === chunk.length) {
        return; // Chunk ended mid-pattern: partial match
      }
      if (chunk[cursor] !== tagBytes[tagPos]) {
        // Mismatch - this candidate is not the tag.
        result.charactersFound = 0;
        result.afterHeadTag = -1;
        break;
      }
      cursor += 1;
      result.charactersFound += 1;
      result.afterHeadTag = cursor; // Index just past the bytes matched so far
    }

    if (result.charactersFound === tagLength) {
      return; // Found
    }
  }
}

/**
 * Resume a match that was cut off at the end of the previous chunk.
 * Only the start of `chunk` is compared, against the part of `tagBytes` that
 * `lastSplitResult` has not matched yet; the previous chunk is not needed.
 *
 * Outcome in `result`:
 *  - charactersFound === tagBytes.length: the tag is complete and
 *    afterHeadTag is the index in `chunk` just past it
 *  - charactersFound 0 and afterHeadTag -1: the continuation did not match
 *  - anything in between: `chunk` ended before the tag did
 * foundIndex is set to minus the number of bytes matched in the previous
 * chunk; a negative value marks a match spanning chunks.
 *
 * @param {Uint8Array} chunk - the chunk following the partial match
 * @param {Uint8Array} tagBytes - byte pattern being searched for
 * @param {{charactersFound: number}} lastSplitResult - progress of the partial match
 * @param {{charactersFound: number, foundIndex: number, afterHeadTag: number}} result
 *   object overwritten with the outcome
 * @returns {object|undefined} `result` when `chunk` runs out before the
 *   comparison is finished, otherwise undefined
 */
function continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result) {
  // Read the carried-over progress once, before `result` is touched.
  const carriedOver = lastSplitResult.charactersFound;

  result.charactersFound = carriedOver;
  result.foundIndex = -1 * carriedOver; // Not used as an index; negative means chunk-spanning
  result.afterHeadTag = 0;

  let cursor = 0;
  let tagPos = carriedOver;
  while (tagPos < tagBytes.length) {
    if (cursor === chunk.length) {
      return result; // Supports a chunk that is shorter than the rest of the tag
    }
    if (chunk[cursor] !== tagBytes[tagPos]) {
      result.charactersFound = 0;
      result.afterHeadTag = -1;
      return;
    }
    cursor += 1;
    tagPos += 1;
    result.charactersFound += 1;
    result.afterHeadTag = cursor;
  }
}

/**
 * Search `chunk` for `tagBytes`, first finishing a partial match carried over
 * from the previous chunk when there is one. The outcome is reported through
 * `result`.
 *
 * @param {Uint8Array} chunk - bytes to search
 * @param {Uint8Array} tagBytes - byte pattern to find
 * @param {?object} lastSplitResult - progress of a partial match at the end
 *   of the previous chunk, or null/undefined when there is none
 * @param {object} result - object overwritten with the outcome
 * @returns {object|undefined} `result` when a carried-over match was
 *   completed; otherwise whatever the fresh search returns
 */
function continueOrNewSearch(chunk, tagBytes, lastSplitResult, result) {
  // Nothing carried over: plain search from the start of the chunk.
  if (lastSplitResult == null) {
    searchByteArrayChunkForClosingTag(chunk, tagBytes, result);
    return;
  }

  continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result);

  const tagCompleted = result.charactersFound === tagBytes.length;
  return tagCompleted
    ? result
    : searchByteArrayChunkForClosingTag(chunk, tagBytes, result); // Keep searching onward
}

/**
 * Reads chunks (leftovers from an earlier pass first, then the reader) and
 * searches them for `injectionJob.tagBytes`. On a complete match it writes
 * the bytes that precede the tag, then the tag and
 * `injectionJob.forInjection`, stores the rest of the chunk in
 * `processingState.leftOvers`, and returns.
 *
 * NOTE(review): this listing is truncated partway through the function (see
 * the marker further down). The end of this function and the header of the
 * function that follows it are not present, so the span does not parse as
 * JavaScript as it stands.
 *
 * @param {object} processingState - shared per-request state; this function
 *   reads `reader`, `writer`, `result`, and reads/writes `leftOvers` and
 *   `inputDone`
 * @param {object} injectionJob - `{ forInjection, tagBytes, insertAfterTag }`
 * @returns {Promise<true|undefined>} true when the input ended and the writer
 *   was closed; undefined after an injection or when a guard returns early
 * @throws {string} 'tag not found' when the input ends while a partial match
 *   is pending
 */
async function parseForInjection(processingState, injectionJob)
{
  if (processingState.inputDone) return; //Input was already exhausted by an earlier pass: the tag was never found
  if (!injectionJob) return;
  if (!injectionJob.tagBytes) return;
  if (!injectionJob.forInjection) return;

  let reader = processingState.reader;
  let writer = processingState.writer;
  let result = processingState.result;
  let tagBytes = injectionJob.tagBytes;
  //(reader, writer, tagBytes, forInjection)

  //Progress of a match cut off at the end of the previous chunk, or null
  let lastSplitResult = null;
  let chunk = null;
  processingState.inputDone = false;
  for (;;) {
    //Consume data left over from a previous injection pass before reading more
    if (processingState.leftOvers)
      {
      chunk = processingState.leftOvers;
      processingState.leftOvers = null;
      }
      else
      {
      let readerResult = await reader.read();
      chunk = readerResult.value;
      processingState.inputDone = readerResult.done;
      }

      if (processingState.inputDone) {
        if (lastSplitResult !== null) {
            //Very edge case: the input ended while a partial match of tagBytes was pending
            console.log('edge');
                  //NOTE(review): throws a bare string rather than an Error object
                  throw 'tag not found'; //Causing the system to fall back to the direct request
        }
        await writer.close();
        return true;
      }   
      //console.log(value.length);

        //The outcome is delivered through the shared `result` object; the return value is ignored
        continueOrNewSearch(chunk, tagBytes, lastSplitResult, result)
      //console.log(result);

      if (result.charactersFound === tagBytes.length) //Complete match
      {
        //Inject
        //console.log('inject');
        //Forward what precedes the tag. foundIndex is 0 when the tag starts the
        //chunk and negative when the tag began in the previous chunk; in both
        //cases nothing of this chunk precedes it.
        if (result.foundIndex > 0)
        {
          let partValue = chunk.slice(0, result.foundIndex);
          //console.log(partValue);
          await writer.write(partValue);
        }
        console.log('injected');
        //NOTE(review): the flag is read from the function object
        //`parseForInjection`, not from `injectionJob`. Nothing visible here
        //assigns that property, so this condition looks like it is always
        //falsy and the else branch (tag first, then the injection) always runs.
        //NOTE(review): the if-branch writes the injection BEFORE the tag, which
        //looks inverted relative to the name `insertAfterTag` - confirm the
        //intent before correcting the condition.
        if (parseForInjection.insertAfterTag)
        {
            await writer.write(injectionJob.forInjection);
            await writer.write(injectionJob.tagBytes);
        }
        else
        {
            await writer.write(injectionJob.tagBytes);
            await writer.write(injectionJob.forInjection);
        }
        //NOTE(review): the end index of slice() is exclusive, so
        //`chunk.length - 1` leaves out the last byte of the chunk.
        let remainder = chunk.slice(result.afterHeadTag, chunk.length - 1);
        processingState.leftOvers = remainder;
        lastSplitResult = null;
        return;
      }

      if (lastSplitResult !== null)
      {
        //console.log('no match over boundary');
        //The remainder wasn't found, so write the partial match from before (maybe `<` or ` is never found?            
        //NOTE(review): the listing is truncated at the line above. The code
        //below uses `processingState.reader`/`.writer` directly and ends in a
        //`catch` with no visible `try`, so it presumably belongs to a separate
        //function - likely the forwardTheRest that injectScripts calls; confirm
        //against the original script.

  //Write any data left over from the last injection pass
  if (processingState.leftOvers)
  {
    chunk = processingState.leftOvers;
    await processingState.writer.write(chunk);
  }

  //Release the locks before piping the streams directly
  processingState.reader.releaseLock();
  processingState.writer.releaseLock();

  await processingState.readStream.pipeTo(processingState.writeStream);

  //Should there be an explicit close method called? I couldn't find one
  }
  catch (e)
  {
    //Errors are only logged here, not rethrown
    console.log(e);
  }
}

Further explanation of working directly with (utf-8) bytes:

Only working with byte values. This is possible at least by searching for the first distinctive utf-8 byte of a character (< 128 and > 192). But in this case, we're searching for HTML tags such as `<head>` or `</head>`, which are made up entirely of lower-than-128 bytes, so they are very easy to work with.
Given the nature of searching for utf-8 (which is the trickiest), this should work with ['utf-8', 'utf8', 'iso-8859-1', 'us-ascii']. You will need to change the snippet encoder to match.
This isn't thoroughly tested. The boundary case didn't trigger for me. Ideally, we would have a testing rig for the core functions.
thanks to Kenton Varda for challenging me
Please let me know if there's a CloudFlare workers way to do pipeTo in the forwardTheRest function
You might find continueOrNewSearch and the two sub-functions to be an interesting approach to finding multi-bytes across a chunk boundary. Up until the boundary we just count how many bytes are found. There's no need to keep those bytes (we know what they are). Then on the next chunk we continue where we left off. We always cut the array buffer around the header, and make sure we write the header bytes (using the tagBytes)

How to inject javascript in existing HTML response with node.js and cloudflare workers

Answers (1)

Related Questions