<?php
/**
 * CodeIgniter_Sniffs_Files_Utf8EncodingSniff.
 *
 * PHP version 5
 *
 * @category  PHP
 * @package   PHP_CodeSniffer
 * @author    Thomas Ernest <thomas.ernest@baobaz.com>
 * @copyright 2006 Thomas Ernest
 * @license   http://thomas.ernest.fr/developement/php_cs/licence GNU General Public License
 * @link      http://pear.php.net/package/PHP_CodeSniffer
 */

/**
 * CodeIgniter_Sniffs_Files_Utf8EncodingSniff.
 *
 * Ensures that PHP files are encoded with Unicode (UTF-8) encoding.
 *
 * @category  PHP
 * @package   PHP_CodeSniffer
 * @author    Thomas Ernest <thomas.ernest@baobaz.com>
 * @copyright 2006 Thomas Ernest
 * @license   http://thomas.ernest.fr/developement/php_cs/licence GNU General Public License
 * @link      http://pear.php.net/package/PHP_CodeSniffer
 */

namespace CodeIgniter\Sniffs\Files;

use PHP_CodeSniffer\Sniffs\Sniff;
use PHP_CodeSniffer\Files\File;

class Utf8EncodingSniff implements Sniff
{

    /**
     * Returns an array of tokens this test wants to listen for.
     *
     * @return array
     */
    public function register()
    {
        return array(
            T_OPEN_TAG
        );

    }//end register()


    /**
     * Processes this test, when one of its tokens is encountered.
     *
     * @param File $phpcsFile The current file being scanned.
     * @param int                  $stackPtr  The position of the current token
     *                                        in the stack passed in $tokens.
     *
     * @return void
     */
    public function process(File $phpcsFile, $stackPtr)
    {
        // We are only interested if this is the first open tag.
        if ($stackPtr !== 0) {
            if ($phpcsFile->findPrevious(T_OPEN_TAG, ($stackPtr - 1)) !== false) {
                return;
            }
        }

        $file_path = $phpcsFile->getFilename();
        $file_name = basename($file_path);
        $file_content = file_get_contents($file_path);
        if (false === mb_check_encoding($file_content, 'UTF-8')) {
            $error = 'File "' . $file_name . '" should be saved with Unicode (UTF-8) encoding.';
            $phpcsFile->addError($error, 0, 'utf8OrDie');
        }
        if ( ! self::_checkUtf8W3c($file_content)) {
            $error = 'File "' . $file_name . '" should be saved with Unicode (UTF-8) encoding, but it did not successfully pass the W3C test.';
            $phpcsFile->addError($error, 0, 'utf8OrDie1');
        }
        if ( ! self::_checkUtf8Rfc3629($file_content)) {
            $error = 'File "' . $file_name . '" should be saved with Unicode (UTF-8) encoding, but it did not meet RFC3629 requirements.';
            $phpcsFile->addError($error, 0, 'utf8OrDie2');
        }
    }//end process()


    /**
     * Checks that the string $content contains only valid UTF-8 chars
     * using W3C's method.
     * Returns true if $content contains only UTF-8 chars, false otherwise.
     *
     * @param string $content String to check.
     *
     * @return bool true if $content contains only UTF-8 chars, false otherwise.
     *
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    private static function _checkUtf8W3c($content)
    {
        $content_chunks=self::mb_chunk_split($content, 4096, '');
    	foreach($content_chunks as $content_chunk)
		{
			$preg_result= preg_match(
            '%^(?:
                  [\x09\x0A\x0D\x20-\x7E]            # ASCII
                | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
                |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
                | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
                |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
                |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
                | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
                |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
            )*$%xs',
            $content_chunk
			);
			if($preg_result!==1)
			{
				return false;
			}

		}
		return true;
    }//end _checkUtf8W3c()

    /**
     * Checks that the string $content contains only valid UTF-8 chars
     * using the method described in RFC 3629.
     * Returns true if $content contains only UTF-8 chars, false otherwise.
     *
     * @param string $content String to check.
     *
     * @return bool true if $content contains only UTF-8 chars, false otherwise.
     *
     * @see http://www.php.net/manual/en/function.mb-detect-encoding.php#85294
     */
    private static function _checkUtf8Rfc3629($content)
    {
        $len = strlen($content);
        for ($i = 0; $i < $len; $i++) {
            $c = ord($content[$i]);
            if ($c > 128) {
                if (($c >= 254)) {
                    return false;
                } elseif ($c >= 252) {
                    $bits=6;
                } elseif ($c >= 248) {
                    $bits=5;
                } elseif ($c >= 240) {
                    $bytes = 4;
                } elseif ($c >= 224) {
                    $bytes = 3;
                } elseif ($c >= 192) {
                    $bytes = 2;
                } else {
                    return false;
                } if (($i + $bytes) > $len) {
                    return false;
                } while ($bytes > 1) {
                    $i++;
                    $b = ord($content[$i]);
                    if ($b < 128 || $b > 191) {
                        return false;
                    }
                    $bytes--;
                }
            }
        }
        return true;
    }//_checkUtf8Rfc3629()

	 /**
     * Splits a string to chunks of given size
	 * This helps to avoid segmentation fault errors when large text is given
     * Returns array of strings after splitting
     *
     * @param string $str String to split.
	 * @param int $len number of characters per chunk
     *
     * @return array string array after splitting
     *
     * @see http://php.net/manual/en/function.chunk-split.php
     */
	private static function mb_chunk_split($str, $len, $glue)
	{
		if (empty($str)) return false;
		$array = self::mbStringToArray ($str);
		$n = -1;
		$new = Array();
		foreach ($array as $char) {
			$n++;
			if ($n < $len) $new []= $char;
			elseif ($n == $len) {
				$new []= $glue . $char;
				$n = 0;
			}
		}
		return $new;
	}//mb_chunk_split
	/**
     * Supporting function for mb_chunk_split
     *
     * @param string $str
	 *
     * @return array
     *
     * @see http://php.net/manual/en/function.chunk-split.php
     */
	private static function mbStringToArray ($str)
	{
		if (empty($str)) return false;
		$len = mb_strlen($str);
		$array = array();
		for ($i = 0; $i < $len; $i++) {
			$array[] = mb_substr($str, $i, 1);
		}
		return $array;
	}



}//end class

?>