Omu
Omu

Reputation: 71288

automatically remove indentation, unneeded empty space from html (minify)

I have an ASP.NET MVC application, and when I look at the page's source I see that the HTML is indented and contains lots of white space.

I think that if I remove all of this white space, my pages will become smaller (in KB).

Does anybody know how to remove it automatically?

Upvotes: 1

Views: 2727

Answers (3)

Gup3rSuR4c
Gup3rSuR4c

Reputation: 9488

I'd like to recommend the following code. It works perfectly (I'm using it on several websites) and it's simpler than @David's version:

using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;

/// <summary>
/// Action filter that installs a <c>WhitespaceFilter</c> as the response filter
/// before the action runs.
/// </summary>
public class WhitespaceStrip : ActionFilterAttribute {
    /// <summary>
    /// Replaces the current response filter with a new <c>WhitespaceFilter</c>.
    /// </summary>
    /// <param name="Context">Context of the action that is about to execute.</param>
    public override void OnActionExecuting(ActionExecutingContext Context) {
        try {
            var response = Context.HttpContext.Response;
            response.Filter = new WhitespaceFilter();
        } catch (Exception) {
            // Best effort: if the filter cannot be installed, the response is
            // simply sent without whitespace stripping.
        }
    }
}

/// <summary>
/// Response filter that removes redundant whitespace and HTML comments from
/// <c>text/html</c> output and forwards the result to the filter that was
/// installed on the response when this instance was created.
/// </summary>
/// <remarks>
/// Every <c>Write</c> call is processed independently. Content that straddles two
/// calls (a multi-byte character, a comment, a &lt;pre&gt; element) is therefore
/// not seen as a single unit by the decoder or the regular expressions.
/// </remarks>
public class WhitespaceFilter : MemoryStream {
    // Built once per AppDomain: constructing a compiled Regex on every Write is expensive.
    // Group 1 captures a whole <pre>...</pre> element so "$1" puts it back untouched;
    // every other alternative leaves group 1 empty, so the match is removed.
    private static readonly Regex WhitespaceRegex = new Regex("(<pre>[^<>]*(((?<Open><)[^<>]*)+((?<Close-Open>>)[^<>]*)+)*(?(Open)(?!))</pre>)|\\s\\s+|[\\t\\n\\r]", RegexOptions.Compiled | RegexOptions.Singleline);
    private static readonly Regex CommentsRegex = new Regex("<!--.*?-->", RegexOptions.Compiled | RegexOptions.Singleline);

    private HttpResponse Response = HttpContext.Current.Response;
    private Stream Filter = null;

    private string Source = string.Empty;
    private string[] ContentTypes = new string[1] {
        "text/html"
    };

    /// <summary>
    /// Captures the response's current filter so that processed output can be
    /// chained to it.
    /// </summary>
    public WhitespaceFilter() {
        this.Filter = this.Response.Filter;
    }

    /// <summary>
    /// Minifies the given slice when the response is HTML; otherwise forwards the
    /// slice unchanged.
    /// </summary>
    /// <param name="Buffer">Buffer holding the bytes to write.</param>
    /// <param name="Offset">Index in <paramref name="Buffer"/> of the first byte to write.</param>
    /// <param name="Count">Number of bytes to write.</param>
    public override void Write(
        byte[] Buffer,
        int Offset,
        int Count) {
        if (!this.ContentTypes.Contains(this.Response.ContentType)) {
            // Not HTML: pass the caller's bytes through untouched. Decoding and
            // re-encoding them as UTF-8 would corrupt binary payloads.
            this.Filter.Write(Buffer, Offset, Count);
            return;
        }

        this.Response.ContentEncoding = Encoding.UTF8;

        // Decode only the slice the caller asked for; the rest of the buffer may
        // contain stale data from a previous write.
        this.Source = Encoding.UTF8.GetString(Buffer, Offset, Count);
        this.Source = WhitespaceRegex.Replace(this.Source, "$1");
        this.Source = CommentsRegex.Replace(this.Source, string.Empty);

        // The re-encoded array is a fresh buffer, so it is written from index 0
        // rather than from the caller's Offset.
        byte[] output = Encoding.UTF8.GetBytes(this.Source);
        this.Filter.Write(output, 0, output.Length);
    }
}

UPDATE

@Omu, just because it irritated me when you said it was "6x" slower, I set out to see whether you're right or not. I ended up rewriting the filter and cleaning it up a bit, and then I ran some tests where I looped a table 10,000 times to generate some white space and see how the filters perform. When all was said and done, I saw no difference between the two regular expressions at all.

Now, if you're implying that the two expressions work differently and that mine is going to be slower, then maybe there's some truth to that, but for you to see any difference you'd have to push out HTML pages larger than 1 MB... which I hope is not what you're doing.

Furthermore, my expression preserves white space within <pre> elements...

All of that being said, here's my revised version:

using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;

/// <summary>
/// Class-level action filter that routes the response through a
/// <c>WhitespaceStream</c> once the action has executed.
/// </summary>
[AttributeUsage(AttributeTargets.Class, Inherited = true, AllowMultiple = false)]
internal class WhitespaceStripAttribute : ActionFilterAttribute {
    /// <summary>
    /// Installs a new <c>WhitespaceStream</c> as the response filter.
    /// </summary>
    /// <param name="ActionExecutedContext">Context of the action that has just executed.</param>
    public override void OnActionExecuted(ActionExecutedContext ActionExecutedContext) {
        var httpContext = ActionExecutedContext.HttpContext;
        var response = httpContext.Response;

        response.Filter = new WhitespaceStream(httpContext);
    }
}

/// <summary>
/// Response filter that removes redundant whitespace and HTML comments from
/// <c>text/html</c> output and forwards the result to the filter that was
/// installed on the response when this instance was created.
/// </summary>
/// <remarks>
/// Every <c>Write</c> call is processed independently. Content that straddles two
/// calls (a multi-byte character, a comment, a &lt;pre&gt; element) is therefore
/// not seen as a single unit by the decoder or the regular expressions.
/// </remarks>
internal class WhitespaceStream : MemoryStream {
    private readonly HttpContextBase HttpContext = null;
    private readonly Stream FilterStream = null;

    private readonly string[] ContentTypes = new string[1] {
        "text/html"
    };

    // Group 1 captures a whole <pre>...</pre> element so "$1" puts it back untouched;
    // every other alternative leaves group 1 empty, so the match is removed.
    private static readonly Regex WhitespaceRegex = new Regex("(<pre>[^<>]*(((?<Open><)[^<>]*)+((?<Close-Open>>)[^<>]*)+)*(?(Open)(?!))</pre>)|\\s\\s+|[\\t\\n\\r]", RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex CommentsRegex = new Regex("<!--.*?-->", RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>
    /// Remembers the context and captures its response's current filter so that
    /// processed output can be chained to it.
    /// </summary>
    /// <param name="HttpContext">Context whose response is being filtered.</param>
    public WhitespaceStream(
        HttpContextBase HttpContext) {
        this.HttpContext = HttpContext;
        this.FilterStream = HttpContext.Response.Filter;
    }

    /// <summary>
    /// Minifies the given slice when the response is HTML; otherwise forwards the
    /// slice unchanged.
    /// </summary>
    /// <param name="Buffer">Buffer holding the bytes to write.</param>
    /// <param name="Offset">Index in <paramref name="Buffer"/> of the first byte to write.</param>
    /// <param name="Count">Number of bytes to write.</param>
    public override void Write(
        byte[] Buffer,
        int Offset,
        int Count) {
        bool isHtml = this.ContentTypes.Any(
            ct =>
                (ct == this.HttpContext.Response.ContentType));

        if (!isHtml) {
            // Not HTML: pass the caller's bytes through untouched. Decoding and
            // re-encoding them as UTF-8 would corrupt binary payloads.
            this.FilterStream.Write(Buffer, Offset, Count);
            return;
        }

        this.HttpContext.Response.ContentEncoding = Encoding.UTF8;

        // Decode only the slice the caller asked for; the rest of the buffer may
        // contain stale data from a previous write.
        string Source = Encoding.UTF8.GetString(Buffer, Offset, Count);
        Source = WhitespaceRegex.Replace(Source, "$1");
        Source = CommentsRegex.Replace(Source, string.Empty);

        // The re-encoded array is a fresh buffer, so it is written from index 0
        // rather than from the caller's Offset.
        byte[] output = Encoding.UTF8.GetBytes(Source);
        this.FilterStream.Write(output, 0, output.Length);
    }
}

Upvotes: 2

David
David

Reputation: 15360

Taken from http://madskristensen.net/post/A-whitespace-removal-HTTP-module-for-ASPNET-20.aspx.

/// <summary>
/// Class-level action filter that wraps the response's current filter in a
/// <c>WhiteSpaceStream</c> before the action runs.
/// </summary>
[AttributeUsage(AttributeTargets.Class, Inherited = true, AllowMultiple = false)]
internal class WhiteSpaceFilterAttribute : ActionFilterAttribute
{
    /// <summary>
    /// Chains a new <c>WhiteSpaceStream</c> in front of the existing response filter.
    /// </summary>
    /// <param name="filterContext">Context of the action that is about to execute.</param>
    public override void OnActionExecuting(ActionExecutingContext filterContext)
    {
        var response = filterContext.HttpContext.Response;
        var downstream = response.Filter;

        response.Filter = new WhiteSpaceStream(downstream);
    }
}

/// <summary>
/// Stream wrapper that removes whitespace runs matched by a fixed regular
/// expression from everything written to it, then forwards the result to the
/// wrapped sink stream. All other operations are delegated to the sink.
/// </summary>
/// <remarks>
/// Every <c>Write</c> call is processed independently, so a whitespace run that is
/// split across two calls is matched per fragment rather than as a whole.
/// </remarks>
internal class WhiteSpaceStream : Stream
{
    private readonly Stream m_sink;

    // Pattern kept exactly as published in the referenced post; each match is
    // replaced with the empty string in Write.
    private static readonly Regex m_regex = new Regex(@"(?<=[^])\t{2,}|(?<=[>])\s{2,}(?=[<])|(?<=[>])\s{2,11}(?=[<])|(?=[\n])\s{2,}");
    //private static Regex m_regex = new Regex(@"^\s+", RegexOptions.Multiline | RegexOptions.Compiled); 

    /// <summary>Creates a filter that writes its processed output to <paramref name="sink"/>.</summary>
    /// <param name="sink">Stream that receives the processed bytes.</param>
    public WhiteSpaceStream(Stream sink)
    {
        m_sink = sink;
    }

    /// <summary>Always reports <c>true</c>.</summary>
    public override bool CanRead
    {
        get { return true; }
    }

    /// <summary>Always reports <c>true</c>.</summary>
    public override bool CanSeek
    {
        get { return true; }
    }

    /// <summary>Always reports <c>true</c>.</summary>
    public override bool CanWrite
    {
        get { return true; }
    }

    /// <summary>Flushes the sink.</summary>
    public override void Flush()
    {
        m_sink.Flush();
    }

    /// <summary>Always reports 0; the wrapper does not track how much was written.</summary>
    public override long Length
    {
        get { return 0; }
    }

    private long _position;

    /// <summary>Stored locally; it is neither read from nor applied to the sink.</summary>
    public override long Position
    {
        get { return _position; }
        set { _position = value; }
    }

    /// <summary>Reads directly from the sink.</summary>
    public override int Read(byte[] buffer, int offset, int count)
    {
        return m_sink.Read(buffer, offset, count);
    }

    /// <summary>Seeks the sink.</summary>
    public override long Seek(long offset, SeekOrigin origin)
    {
        return m_sink.Seek(offset, origin);
    }

    /// <summary>Sets the length of the sink.</summary>
    public override void SetLength(long value)
    {
        m_sink.SetLength(value);
    }

    /// <summary>Closes the sink.</summary>
    public override void Close()
    {
        m_sink.Close();
    }

    /// <summary>
    /// Decodes the requested slice, removes every regex match and writes the
    /// re-encoded text to the sink.
    /// </summary>
    /// <param name="buffer">Buffer holding the bytes to write.</param>
    /// <param name="offset">Index in <paramref name="buffer"/> of the first byte to write.</param>
    /// <param name="count">Number of bytes to write.</param>
    public override void Write(byte[] buffer, int offset, int count)
    {
        // Decode only [offset, offset + count). Decoding the entire buffer would
        // re-emit stale bytes that lie outside the slice the caller asked for.
        // NOTE(review): Encoding.Default may not match the response encoding - confirm.
        string text = Encoding.Default.GetString(buffer, offset, count);

        text = m_regex.Replace(text, string.Empty);

        byte[] outdata = Encoding.Default.GetBytes(text);
        m_sink.Write(outdata, 0, outdata.Length);
    }
}

Upvotes: 2

Piskvor left the building
Piskvor left the building

Reputation: 92792

Since multiple whitespace characters are treated as one in HTML, you could apply a regular expression to your response:

s/\s+/ /g

which converts any consecutive whitespaces to a single space.

Note that although this will decrease the size of uncompressed page, if you're gzipping the pages, the savings won't be that great.

Caveat: this could break inline JavaScript, as JS treats an endline as command delimiter (that is, like ;). If your JS uses ; for delimiting commands (most JS does), you should be OK.

Also, code samples in <pre> blocks will be affected, as whitespace is displayed there:

some   code   here {
  more          code }

becomes

some code here { more code }

Upvotes: 1

Related Questions