codecowboy
codecowboy

Reputation: 10124

XSS filtering function in PHP

Does anyone know of a good function out there for filtering generic input from forms? Zend_Filter_input seems to require prior knowledge of the contents of the input and I'm concerned that using something like HTML Purifier will have a big performance impact.

What about something like:

function sacarXss($val) {
   $val = preg_replace('/([\x00-\x08][\x0b-\x0c][\x0e-\x20])/', '', $val);
   $search = 'abcdefghijklmnopqrstuvwxyz';
   $search .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
   $search .= '1234567890!@#$%^&*()';
   $search .= '~`";:?+/={}[]-_|\'\\';
   for ($i = 0; $i < strlen($search); $i++) {
      $val = preg_replace('/(&#[x|X]0{0,8}'.dechex(ord($search[$i])).';?)/i', $search[$i], $val); // with a ;
      $val = preg_replace('/(&#0{0,8}'.ord($search[$i]).';?)/', $search[$i], $val); // with a ;
   }
   $ra1 = Array('javascript', 'vbscript', 'expression', 'applet', 'meta', 'xml', 'blink', 'link', 'style', 'script', 'embed', 'object', 'iframe', 'frame', 'frameset', 'ilayer', 'layer', 'bgsound', 'title', 'base');
   $ra2 = Array('onabort', 'onactivate', 'onafterprint', 'onafterupdate', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut', 'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint', 'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'oncellchange', 'onchange', 'onclick', 'oncontextmenu', 'oncontrolselect', 'oncopy', 'oncut', 'ondataavailable', 'ondatasetchanged', 'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragend', 'ondragenter', 'ondragleave', 'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate', 'onfilterchange', 'onfinish', 'onfocus', 'onfocusin', 'onfocusout', 'onhelp', 'onkeydown', 'onkeypress', 'onkeyup', 'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter', 'onmouseleave', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup', 'onmousewheel', 'onmove', 'onmoveend', 'onmovestart', 'onpaste', 'onpropertychange', 'onreadystatechange', 'onreset', 'onresize', 'onresizeend', 'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete', 'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange', 'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onunload');
   $ra = array_merge($ra1, $ra2);
   $found = true; 
   while ($found == true) {
      $val_before = $val;
      for ($i = 0; $i < sizeof($ra); $i++) {
         $pattern = '/';
         for ($j = 0; $j < strlen($ra[$i]); $j++) {
            if ($j > 0) {
               $pattern .= '(';
               $pattern .= '(&#[x|X]0{0,8}([9][a][b]);?)?';
               $pattern .= '|(&#0{0,8}([9][10][13]);?)?';
               $pattern .= ')?';
            }
            $pattern .= $ra[$i][$j];
         }
         $pattern .= '/i';
         $replacement = substr($ra[$i], 0, 2).'<x>'.substr($ra[$i], 2);
         $val = preg_replace($pattern, $replacement, $val);
         if ($val_before == $val) {
            $found = false;
         }
      }
   }
  return $val;
} 
echo sacarXss("testeando javascript:alert('hola');");

Upvotes: 47

Views: 123623

Answers (10)

user15041839
user15041839

Reputation:

I'm was collect most of issues by the web and combine stepping filter for all of them.

After some testing seems it works perfect:


/*
* Total XSS preventer class by Full-R
*
*/

final class xCleaner {

    public static function clean( string $html ): string {

        return self::cleanXSS(

            preg_replace(

                [

                    '/\s?<iframe[^>]*?>.*?<\/iframe>\s?/si',
                    '/\s?<style[^>]*?>.*?<\/style>\s?/si',
                    '/\s?<script[^>]*?>.*?<\/script>\s?/si',
                    '#\son\w*="[^"]+"#',

                ],

                [
                    '',
                    '',
                    ''
                ],

                $html

            )

        );

    }

    protected static function hexToSymbols( string $s ): string {

        return html_entity_decode($s, ENT_XML1, 'UTF-8');

    }

    protected static function escape( string $s, string $m = 'attr' ): string {

        preg_match_all('/data:\w+\/([a-zA-Z]*);base64,(?!_#_#_)([^)\'"]*)/mi', $s, $b64, PREG_OFFSET_CAPTURE);

        if( count( array_filter( $b64 ) ) > 0 ) {

            switch( $m ) {

                case 'attr':

                    $xclean = self::cleanXSS(

                                        urldecode(

                                            base64_decode(

                                                $b64[ 2 ][ 0 ][ 0 ]

                                            )

                                        )

                                );

                    break;

                case 'tag':

                    $xclean = self::cleanTagInnerXSS(

                                        urldecode(

                                            base64_decode(

                                                $b64[ 2 ][ 0 ][ 0 ]

                                            )

                                        )

                                );

                    break;

            }

            return substr_replace(

                $s,

                '_#_#_'. base64_encode( $xclean ),

                $b64[ 2 ][ 0 ][ 1 ],

                strlen( $b64[ 2 ][ 0 ][ 0 ] )

            );

        }
        else {

            return $s;

        }

    }

    protected static function cleanXSS( string $s ): string {

        // base64 injection prevention
        $st = self::escape( $s, 'attr' );

        return preg_replace([

                // JSON unicode
                '/\\\\u?{?([a-f0-9]{4,}?)}?/mi',                                                                    // [1] unicode JSON clean

                // Data b64 safe
                '/\*\w*\*/mi',                                                                                            // [2] unicode simple clean

                // Malware payloads
                '/:?e[\s]*x[\s]*p[\s]*r[\s]*e[\s]*s[\s]*s[\s]*i[\s]*o[\s]*n[\s]*(:|;|,)?\w*/mi',    // [3]  (:expression) evalution
                '/l[\s]*i[\s]*v[\s]*e[\s]*s[\s]*c[\s]*r[\s]*i[\s]*p[\s]*t[\s]*(:|;|,)?\w*/mi',         // [4]  (livescript:) evalution
                '/j[\s]*s[\s]*c[\s]*r[\s]*i[\s]*p[\s]*t[\s]*(:|;|,)?\w*/mi',                                 // [5]  (jscript:) evalution
                '/j[\s]*a[\s]*v[\s]*a[\s]*s[\s]*c[\s]*r[\s]*i[\s]*p[\s]*t[\s]*(:|;|,)?\w*/mi',       // [6]  (javascript:) evalution
                '/b[\s]*e[\s]*h[\s]*a[\s]*v[\s]*i[\s]*o[\s]*r[\s]*(:|;|,)?\w*/mi',                     // [7]  (behavior:) evalution
                '/v[\s]*b[\s]*s[\s]*c[\s]*r[\s]*i[\s]*p[\s]*t[\s]*(:|;|,)?\w*/mi',                      // [8]  (vsbscript:) evalution
                '/v[\s]*b[\s]*s[\s]*(:|;|,)?\w*/mi',                                                              // [9]  (vbs:) evalution
                '/e[\s]*c[\s]*m[\s]*a[\s]*s[\s]*c[\s]*r[\s]*i[\s]*p[\s]*t*(:|;|,)?\w*/mi',        // [10] (ecmascript:) possible ES evalution
                '/b[\s]*i[\s]*n[\s]*d[\s]*i[\s]*n[\s]*g*(:|;|,)?\w*/mi',                                 // [11] (-binding) payload
                '/\+\/v(8|9|\+|\/)?/mi',                                                                          // [12] (UTF-7 mutation)

                // Some entities
                '/&{\w*}\w*/mi',                                                                                   // [13] html entites clenup
                '/&#\d+;?/m',                                                                                      // [14] html entites clenup

                // Script tag encoding mutation issue
                '/\¼\/?\w*\¾\w*/mi',                                                                         // [21] mutation KOI-8
                '/\+ADw-\/?\w*\+AD4-\w*/mi',                                                         // [22] mutation old encodings

                '/\/*?%00*?\//m',

                // base64 escaped
                '/_#_#_/mi',                                                                                       // [23] base64 escaped marker cleanup
             
            ],

            // Replacements steps :: 23
            ['&#x$1;', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''],

            str_ireplace(

                ['\u0', '&colon;', '&tab;', '&newline;'],
                ['\0', ':', '', ''],

            // U-HEX prepare step
            self::hexToSymbols( $st ))

        );

    }

}

Also you can add Tidy markup correction to make HTML valid.

Upvotes: 0

Georg
Georg

Reputation: 1

I found a solution for my problem with the posts with german umlaut. To provide from totally cleaning (killing) the posts, i encode the incoming data:

    *$data = utf8_encode($data);
    ... function ...*

And at last i decode the output to get correct signs:

    *$data = utf8_decode($data);*

Now the post go through the filter function and i get a correct result...

Upvotes: -1

ymakux
ymakux

Reputation: 3495

All above methods don't allow to preserve some tags like <a>, <table> etc. There is an ultimate solution http://sourceforge.net/projects/kses/ Drupal uses it

Upvotes: 0

Doug Amos
Doug Amos

Reputation: 4383

htmlspecialchars() is perfectly adequate for filtering user input that is displayed in html forms.

Upvotes: 1

user3580379
user3580379

Reputation: 1

Try using for Clean XSS

xss_clean($data): "><script>alert(String.fromCharCode(74,111,104,116,111,32,82,111,98,98,105,101))</script>

Upvotes: -2

cletus
cletus

Reputation: 625485

Simple way? Use strip_tags():

$str = strip_tags($input);

You can also use filter_var() for that:

$str = filter_var($input, FILTER_SANITIZE_STRING);

The advantage of filter_var() is that you can control the behaviour by, for example, stripping or encoding low and high characters.

Here is a list of sanitizing filters.

Upvotes: 87

3eighty
3eighty

Reputation: 171

I have a similar problem. I need users to submit html content to a profile page with a great WYSIWYG editor (Redactorjs!), i wrote the following function to clean the submitted html:

    <?php function filterxss($str) {
//Initialize DOM:
$dom = new DOMDocument();
//Load content and add UTF8 hint:
$dom->loadHTML('<meta http-equiv="content-type" content="text/html; charset=utf-8">'.$str);
//Array holds allowed attributes and validation rules:
$check = array('src'=>'#(http://[^\s]+(?=\.(jpe?g|png|gif)))#i','href'=>'|^http(s)?://[a-z0-9-]+(.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$|i');
//Loop all elements:
foreach($dom->getElementsByTagName('*') as $node){
    for($i = $node->attributes->length -1; $i >= 0; $i--){
        //Get the attribute:
        $attribute = $node->attributes->item($i);
        //Check if attribute is allowed:
        if( in_array($attribute->name,array_keys($check))) {
            //Validate by regex:    
            if(!preg_match($check[$attribute->name],$attribute->value)) { 
                //No match? Remove the attribute
                $node->removeAttributeNode($attribute); 
            }
        }else{
            //Not allowed? Remove the attribute:
            $node->removeAttributeNode($attribute);
        }
    }
}
var_dump($dom->saveHTML()); } ?>

The $check array holds all the allowed attributes and validation rules. Maybe this is useful for some of you. I haven't tested is yet, so tips are welcome

Upvotes: 3

Taras
Taras

Reputation: 582

According to www.mcafeesecure.com General Solution for vulnerable to cross-site scripting (XSS) filter function can be:

function xss_cleaner($input_str) {
    $return_str = str_replace( array('<','>',"'",'"',')','('), array('&lt;','&gt;','&apos;','&#x22;','&#x29;','&#x28;'), $input_str );
    $return_str = str_ireplace( '%3Cscript', '', $return_str );
    return $return_str;
}

Upvotes: 0

ingnorant
ingnorant

Reputation: 29

function clean($data){
    $data = rawurldecode($data);
    return filter_var($data, FILTER_SANITIZE_SPEC_CHARS);
}

Upvotes: 2

Sarfraz
Sarfraz

Reputation: 382919

There are a number of ways hackers put to use for XSS attacks, PHP's built-in functions do not respond to all sorts of XSS attacks. Hence, functions such as strip_tags, filter_var, mysql_real_escape_string, htmlentities, htmlspecialchars, etc do not protect us 100%. You need a better mechanism, here is what is solution:

function xss_clean($data)
{
// Fix &entity\n;
$data = str_replace(array('&amp;','&lt;','&gt;'), array('&amp;amp;','&amp;lt;','&amp;gt;'), $data);
$data = preg_replace('/(&#*\w+)[\x00-\x20]+;/u', '$1;', $data);
$data = preg_replace('/(&#x*[0-9A-F]+);*/iu', '$1;', $data);
$data = html_entity_decode($data, ENT_COMPAT, 'UTF-8');

// Remove any attribute starting with "on" or xmlns
$data = preg_replace('#(<[^>]+?[\x00-\x20"\'])(?:on|xmlns)[^>]*+>#iu', '$1>', $data);

// Remove javascript: and vbscript: protocols
$data = preg_replace('#([a-z]*)[\x00-\x20]*=[\x00-\x20]*([`\'"]*)[\x00-\x20]*j[\x00-\x20]*a[\x00-\x20]*v[\x00-\x20]*a[\x00-\x20]*s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:#iu', '$1=$2nojavascript...', $data);
$data = preg_replace('#([a-z]*)[\x00-\x20]*=([\'"]*)[\x00-\x20]*v[\x00-\x20]*b[\x00-\x20]*s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:#iu', '$1=$2novbscript...', $data);
$data = preg_replace('#([a-z]*)[\x00-\x20]*=([\'"]*)[\x00-\x20]*-moz-binding[\x00-\x20]*:#u', '$1=$2nomozbinding...', $data);

// Only works in IE: <span style="width: expression(alert('Ping!'));"></span>
$data = preg_replace('#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*[`\'"]*.*?expression[\x00-\x20]*\([^>]*+>#i', '$1>', $data);
$data = preg_replace('#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*[`\'"]*.*?behaviour[\x00-\x20]*\([^>]*+>#i', '$1>', $data);
$data = preg_replace('#(<[^>]+?)style[\x00-\x20]*=[\x00-\x20]*[`\'"]*.*?s[\x00-\x20]*c[\x00-\x20]*r[\x00-\x20]*i[\x00-\x20]*p[\x00-\x20]*t[\x00-\x20]*:*[^>]*+>#iu', '$1>', $data);

// Remove namespaced elements (we do not need them)
$data = preg_replace('#</*\w+:\w[^>]*+>#i', '', $data);

do
{
    // Remove really unwanted tags
    $old_data = $data;
    $data = preg_replace('#</*(?:applet|b(?:ase|gsound|link)|embed|frame(?:set)?|i(?:frame|layer)|l(?:ayer|ink)|meta|object|s(?:cript|tyle)|title|xml)[^>]*+>#i', '', $data);
}
while ($old_data !== $data);

// we are done...
return $data;
}

Upvotes: 33

Related Questions