From 889dce97b7be538f122486165e32807d7ec332e4 Mon Sep 17 00:00:00 2001 From: Vincent Vanwaelscappel Date: Tue, 28 Jan 2020 18:54:27 +0100 Subject: [PATCH] wip #3373 @1 --- composer.json | 2 +- src/Cubist/Util/Text.php | 1895 +++++++++++++++++++------------------- 2 files changed, 944 insertions(+), 953 deletions(-) diff --git a/composer.json b/composer.json index 269eaf4..4ee9193 100644 --- a/composer.json +++ b/composer.json @@ -29,7 +29,7 @@ "ext-simplexml": "*", "ext-json": "*", "ext-iconv": "*", - "laminas/laminas-filter": "^2.9", + "illuminate/support": "~5.8|^6.0", "cubist/net": "dev-master" } } diff --git a/src/Cubist/Util/Text.php b/src/Cubist/Util/Text.php index 8382125..55a9355 100644 --- a/src/Cubist/Util/Text.php +++ b/src/Cubist/Util/Text.php @@ -1,960 +1,951 @@ $p) { - $str = preg_replace('|[' . $p . ']|u', $r, $str); - } - return $str; - } - - public static function getAccentsPattern() - { - $pattern = array(); - $pattern['A'] = '\x{00C0}-\x{00C5}'; - $pattern['AE'] = '\x{00C6}'; - $pattern['C'] = '\x{00C7}'; - $pattern['D'] = '\x{00D0}'; - $pattern['E'] = '\x{00C8}-\x{00CB}'; - $pattern['I'] = '\x{00CC}-\x{00CF}'; - $pattern['N'] = '\x{00D1}'; - $pattern['O'] = '\x{00D2}-\x{00D6}\x{00D8}'; - $pattern['OE'] = '\x{0152}'; - $pattern['S'] = '\x{0160}'; - $pattern['U'] = '\x{00D9}-\x{00DC}'; - $pattern['Y'] = '\x{00DD}'; - $pattern['Z'] = '\x{017D}'; - - $pattern['a'] = '\x{00E0}-\x{00E5}'; - $pattern['ae'] = '\x{00E6}'; - $pattern['c'] = '\x{00E7}'; - $pattern['d'] = '\x{00F0}'; - $pattern['e'] = '\x{00E8}-\x{00EB}'; - $pattern['i'] = '\x{00EC}-\x{00EF}'; - $pattern['n'] = '\x{00F1}'; - $pattern['o'] = '\x{00F2}-\x{00F6}\x{00F8}'; - $pattern['oe'] = '\x{0153}'; - $pattern['s'] = '\x{0161}'; - $pattern['u'] = '\x{00F9}-\x{00FC}'; - $pattern['y'] = '\x{00FD}\x{00FF}'; - $pattern['z'] = '\x{017E}'; - - $pattern['ss'] = '\x{00DF}'; - return $pattern; - } - - public static function removeAccents($str, $clean = true) - { - $pattern = 
self::getAccentsPattern(); - if ($clean) { - $str = self::cleanUTF8($str); - $del = array('’' => ' ', '”' => ' ', '“' => ' ', '•' => ' ', '…' => ' ', '€' => ' ', - '–' => ' ', '‘' => ' '); - foreach ($del as $d => $p) { - $str = str_replace($d, $p, $str); - } - } - foreach ($pattern as $r => $p) { - $str = preg_replace('/[' . $p . ']/u', $r, $str); - } - - $from = 'o'; - $to = 'o'; - - $str = strtr($str, $from, $to); - - return $str; - } - - public static function keepOnlyLettersAndDigits($str) - { - return self::condenseWhite(preg_replace('|[^0-9A-Za-z]|ui', ' ', self::removeAccents($str))); - } - - public static function makeAccentInsensiblePattern($str) - { - $patterns = self::getAccentsPattern(); - $chars = preg_split('//ui', $str, -1, PREG_SPLIT_NO_EMPTY); - $pattern = '|'; - foreach ($chars as $char) { - if (isset($patterns[$char])) { - $pattern .= '['; - $pattern .= $char; - $pattern .= $patterns[$char]; - $pattern .= ']{1}'; - } else { - $pattern .= $char; - } - } - $pattern .= '|iu'; - return $pattern; - } - - public static function preg_areplace($search, $replace, $subject) - { - $pattern = self::makeAccentInsensiblePattern($search); - return preg_replace($pattern, $replace, $subject); - } - - public static function multiExplode($separator, $str, $limit = null) - { - $seps = array('§', '£', '¤', '#', '¨', '^', '%'); - foreach ($seps as $sep) { - if (stristr($str, $sep)) { - continue; - } - break; - } - - $str = preg_replace('|[' . preg_quote($separator, '-') . 
']|', $sep, $str); - if (is_null($limit)) { - return explode($sep, $str); - } else { - return explode($sep, $str, $limit); - } - } - - public static function countWords($str) - { - return count(preg_split('|\s|', $str)); - } - - public static function explodeNewLines($str) - { - $str = trim($str); - if ($str === '') { - return []; - } - $str = self::condenseNewLines($str); - return preg_split('|\v|', $str); - } - - public static function substrWord($str, $words, $end = '', $wordsorig = null) - { - if (is_null($wordsorig)) { - $wordsorig = $words; - } - - $maxchars = $wordsorig * 6; - - $o = self::countWords($str); - if ($o <= $words) { - $res = $str; - $addend = false; - } else { - $e = self::multiExplode(" \n", $str, $words); - array_pop($e); - $res = implode(' ', $e); - $addend = true; - } - if (mb_strlen($res) > $maxchars) { - return self::substrWord($str, $words - 1, $end, $wordsorig); - } - - if ($addend) { - $res .= $end; - } - - return $res; - } - - public static function substrWordChars($str, $chars, $end = '') - { - if (strlen($str) <= $chars) { - return $str; - } - - $str = trim(substr($str, 0, $chars)); - $s = preg_split('|\s+|', $str); - array_pop($s); - return implode(' ', $s) . $end; - } - - public static function ucfirst($str, $lower = false) - { - if ($lower) { - $str = mb_strtolower($str); - } - $first = mb_substr($str, 0, 1); - $suite = mb_substr($str, 1); - return mb_strtoupper($first) . 
$suite; - } - - public static function removeNl($str) - { - $trans = array("\n" => ' ', "\r" => ' '); - $str = strtr($str, $trans); - return self::condenseWhite($str); - } - - public static function condenseWhite($str) - { - return preg_replace('|[\s]{2,100}|u', ' ', $str); - } - - public static function condenseNewLines($str) - { - $str = self::normalizeLines($str); - $str = preg_replace('|\n{2,100}|', "\n", $str); - return $str; - } - - public static function html2text($str) - { - $res = self::strip_tags($str); - $res = str_replace(' ', ' ', $res); - - return $res; - } - - public static function strip_tags($str, $allowed_tags = array(), $trim = false) - { - // return preg_replace('|\<.*\>|uU', '', $str); - // http://www.php.net/manual/fr/function.strip-tags.php#86463 - if (!is_array($allowed_tags)) { - $allowed_tags = !empty($allowed_tags) ? array($allowed_tags) : array(); - } - $tags = implode('|', $allowed_tags); - - if (empty($tags)) { - $tags = '[a-z]+'; - } - - preg_match_all('@@i', $str, $matches); - - $full_tags = $matches[0]; - $tag_names = $matches[1]; - - foreach ($full_tags as $i => $full_tag) { - if (!in_array($tag_names[$i], $allowed_tags)) { - if ($trim) { - unset($full_tags[$i]); - } else { - $str = str_replace($full_tag, '', $str); - } - } - } - - return $trim ? 
implode('', $full_tags) : $str; - } - - public static function str2URL($str, $replace = '-', $exclude_slashs = false, $exclude_dots = false) - { - if (is_object($str)) { - $str = json_encode($str); - } - $str = str_replace('&', '&', $str); - $str = str_replace(':', ' ', $str); - if (!$exclude_slashs) { - $str = str_replace('/', ' ', $str); - } - - $str = self::deaccent($str); - $str = preg_replace('/[^A-Za-z0-9_\s\'\:\/[\]-]/', '', $str); - - return self::tidyURL($str, true); - - } - - public static function cleanUTF8($str, $replace = '?') - { - while (($bad_index = self::utf8badFind($str)) !== false) { - $str = substr_replace($str, $replace, $bad_index, 1); - } - $str = str_replace('', $replace, $str); - $str = str_replace('', $replace, $str); - return $str; - } - - public static function getChar($code) - { - $code = trim($code, '&;'); - return html_entity_decode('&' . $code . ';', ENT_QUOTES, 'UTF-8'); - } - - public static function randText($length = 300) - { - $str = 'aeiouy azertyuiopqsdfghjklmwxcvbn eaiouaeiou '; - $list = str_split($str); - $nb = strlen($str) - 1; - $res = ''; - for ($i = 0; $i <= $length; $i++) { - $pos = rand(0, $nb); - $res .= $list[$pos]; - } - return $res; - } - - public static function splitWordsWithCase($str) - { - $non_word = '\x{0000}-\x{002F}\x{003A}-\x{0040}\x{005b}-\x{0060}\x{007B}-\x{007E}\x{00A0}-\x{00BF}\s'; - if (preg_match_all('/([^' . $non_word . 
']{3,})/msu', html::clean($str), $match)) { - foreach ($match[1] as $i => $v) { - $match[1][$i] = $v; - } - return $match[1]; - } - return array(); - } - - public static function find_words_from_list($str, $list) - { - $words = array_unique(self::splitWordsWithCase($str));; - if (is_array($list)) { - $liste = $list; - } else { - $liste = array_unique(self::splitWords($list)); - } - - $l = array(); - foreach ($words as $ll) { - $lll = self::removeAccents($ll); - $lll = strtolower($lll); - $liste_real[$lll][] = $ll; - $l[] = $lll; - } - - $diff = array_intersect($liste, $l); - $res = array(); - if ($diff) { - foreach ($diff as $d) { - $res = array_merge($res, $liste_real[$d]); - } - return $res; - } - return false; - } - - public static function mb_str_split($string) - { - $stop = mb_strlen($string); - $result = array(); - - for ($idx = 0; $idx < $stop; $idx++) { - $result[] = mb_substr($string, $idx, 1); - } - - return $result; - } - - public static function strToArray($str) - { - return self::mb_str_split($str); - } - - public static function utf8ToUnicode($str) - { - $mState = 0; // cached expected number of octets after the current octet - // until the beginning of the next UTF8 character sequence - $mUcs4 = 0; // cached Unicode character - $mBytes = 1; // cached expected number of octets in the current sequence - - $out = array(); - - $len = strlen($str); - for ($i = 0; $i < $len; $i++) { - $in = ord($str{$i}); - if (0 == $mState) { - // When mState is zero we expect either a US-ASCII character or a - // multi-octet sequence. - if (0 == (0x80 & ($in))) { - // US-ASCII, pass straight through. 
- $out[] = $in; - $mBytes = 1; - } else if (0xC0 == (0xE0 & ($in))) { - // First octet of 2 octet sequence - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x1F) << 6; - $mState = 1; - $mBytes = 2; - } else if (0xE0 == (0xF0 & ($in))) { - // First octet of 3 octet sequence - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x0F) << 12; - $mState = 2; - $mBytes = 3; - } else if (0xF0 == (0xF8 & ($in))) { - // First octet of 4 octet sequence - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x07) << 18; - $mState = 3; - $mBytes = 4; - } else if (0xF8 == (0xFC & ($in))) { - /* First octet of 5 octet sequence. - * - * This is illegal because the encoded codepoint must be either - * (a) not the shortest form or - * (b) outside the Unicode range of 0-0x10FFFF. - * Rather than trying to resynchronize, we will carry on until the end - * of the sequence and let the later error handling code catch it. - */ - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x03) << 24; - $mState = 4; - $mBytes = 5; - } else if (0xFC == (0xFE & ($in))) { - // First octet of 6 octet sequence, see comments for 5 octet sequence. - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 1) << 30; - $mState = 5; - $mBytes = 6; - } else { - /* Current octet is neither in the US-ASCII range nor a legal first - * octet of a multi-octet sequence. - */ - return false; - } - } else { - // When mState is non-zero, we expect a continuation of the multi-octet - // sequence - if (0x80 == (0xC0 & ($in))) { - // Legal continuation. - $shift = ($mState - 1) * 6; - $tmp = $in; - $tmp = ($tmp & 0x0000003F) << $shift; - $mUcs4 |= $tmp; - - if (0 == --$mState) { - /* End of the multi-octet sequence. mUcs4 now contains the final - * Unicode codepoint to be output - * - * Check for illegal sequences and codepoints. 
- */ - // From Unicode 3.1, non-shortest form is illegal - if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || - ((3 == $mBytes) && ($mUcs4 < 0x0800)) || - ((4 == $mBytes) && ($mUcs4 < 0x10000)) || - (4 < $mBytes) || - // From Unicode 3.2, surrogate characters are illegal - (($mUcs4 & 0xFFFFF800) == 0xD800) || - // Codepoints outside the Unicode range are illegal - ($mUcs4 > 0x10FFFF) - ) { - return false; - } - if (0xFEFF != $mUcs4) { - // BOM is legal but we don't want to output it - $out[] = $mUcs4; - } - // initialize UTF8 cache - $mState = 0; - $mUcs4 = 0; - $mBytes = 1; - } - } else { - /* ((0xC0 & (*in) != 0x80) && (mState != 0)) - * - * Incomplete multi-octet sequence. - */ - return false; - } - } - } - return $out; - } - - /** - * Takes an array of ints representing the Unicode characters and returns - * a UTF-8 string. Astral planes are supported ie. the ints in the - * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates - * are not allowed. - * - * Returns false if the input array contains ints that represent - * surrogates or are outside the Unicode range. 
- */ - public static function unicodeToUtf8($arr) - { - $dest = ''; - foreach ($arr as $src) { - if ($src < 0) { - return false; - } else if ($src <= 0x007f) { - $dest .= chr($src); - } else if ($src <= 0x07ff) { - $dest .= chr(0xc0 | ($src >> 6)); - $dest .= chr(0x80 | ($src & 0x003f)); - } else if ($src == 0xFEFF) { - // nop -- zap the BOM - } else if ($src >= 0xD800 && $src <= 0xDFFF) { - // found a surrogate - return false; - } else if ($src <= 0xffff) { - $dest .= chr(0xe0 | ($src >> 12)); - $dest .= chr(0x80 | (($src >> 6) & 0x003f)); - $dest .= chr(0x80 | ($src & 0x003f)); - } else if ($src <= 0x10ffff) { - $dest .= chr(0xf0 | ($src >> 18)); - $dest .= chr(0x80 | (($src >> 12) & 0x3f)); - $dest .= chr(0x80 | (($src >> 6) & 0x3f)); - $dest .= chr(0x80 | ($src & 0x3f)); - } else { - // out of range - return false; - } - } - return $dest; - } - - public static function uchr($n) - { - return self::unicodeToUtf8(array($n)); - } - - public static function uord($c) - { - $r = self::utf8ToUnicode($c); - return array_shift($r); - } - - public static function strcmp($s1, $s2, $ignoreCase = false, $ignoreAccents = false, $trim = false) - { - if ($trim !== false) { - $s1 = trim($s1, $trim); - $s2 = trim($s2, $trim); - } - if ($ignoreAccents) { - $s1 = self::removeAccents($s1); - $s2 = self::removeAccents($s2); - } - if ($ignoreCase) { - $s1 = mb_strtolower($s1); - $s2 = mb_strtolower($s2); - } - - return strcmp($s1, $s2); - } - - public static function removeNewLines($input) - { - $res = preg_replace("|\s+|", ' ', $input); - return $res; - } - - /** - * - * @param string $str - * @param boolean $compact - * @return array - */ - public static function splitLines($str, $compact = true) - { - $str = str_replace("\r\n", "\n", $str); - $str = str_replace("\r", "\n", $str); - $str = explode("\n", $str); - - if (!$compact) { - return $str; - } - - $res = array(); - foreach ($str as $s) { - $s = trim($s); - if ($s == '') { - continue; - } - $res[] = $s; - } - return $res; - } - 
- public static function parseUrl($url, $forceScheme = true) - { - $url = trim($url); - if (substr($url, 0, 2) == '//') { - $url = 'http:' . $url; - } - $res = parse_url($url); - if ($forceScheme && !isset($res['scheme'])) { - $url = 'http://' . $url; - $res = parse_url($url); - } - - if (isset($res['query'])) { - parse_str($res['query'], $tmp); - $res['query_params'] = $tmp; - } - - if (isset($res['path'])) { - $components = explode('/', trim($res['path'], '/')); - $filteredComponents = array(); - foreach ($components as $c) { - if ($c == '') { - continue; - } - $filteredComponents[] = $c; - } - $res['path_components'] = $filteredComponents; - } - return $res; - } - - public static function pluriel($nb, $singulier, $pluriel, $zero = false, $displayNb = true) - { - $nb = intval($nb); - $res = ''; - if ($displayNb) { - $res .= $nb . ' '; - } - if ($nb == 0 && $zero) { - return $zero; - } - if ($nb <= 1) { - $res .= $singulier; - } else { - $res .= $pluriel; - } - return $res; - } - - public static function normalizeLines($text, $os = 'nix') - { - $text = str_replace("\r\n", "\n", $text); - $text = str_replace("\r", "\n", $text); - if ($os == 'win') { - return str_replace("\n", "\r\n", $text); - } - return $text; - } - - public static function underscoreToCamelCase($str, $upperFirst = false) - { - $inflector = new Zend_Filter_Inflector(':string'); - $inflector->addRules(array(':string' => array('Word_UnderscoreToCamelCase'))); - $str = $inflector->filter(array('string' => $str)); - if (!$upperFirst) { - $str{0} = mb_strtolower($str{0}); - } - return $str; - } - - public static function camelCaseToUnderscore($str) - { - - preg_match_all('!([A-Z][A-Z0-9]*(?=$|[A-Z][a-z0-9])|[A-Za-z][a-z0-9]+)!', $str, $matches); - $ret = $matches[0]; - foreach ($ret as &$match) { - $match = $match == strtoupper($match) ? 
strtolower($match) : lcfirst($match); - } - return implode('_', $ret); - } - - // Stops orphans in HTML by replacing the last space with a   - public static function preventOrphans($str) - { - - $find = ' '; // What to search for - $replace = ' '; // What to replace it with - - $last_space = strrpos($str, $find); // Find last occurrence in string - - if ($last_space !== false) { - $str = substr_replace($str, $replace, $last_space, strlen($find)); - } - - // Also replace punctuation that has spaces before it (eg. in French) - $punctuations = array(' :', ' !', ' ?', '« ', ' »'); - $replacements = array("{$replace}:", "{$replace}!", "{$replace}?", "«{$replace}", "{$replace}»"); - $str = str_replace($punctuations, $replacements, $str); - - return $str; - } - - /** - * Check email address - * - * Returns true if $email is a valid email address. - * - * @copyright Cal Henderson - * @license http://creativecommons.org/licenses/by-sa/2.5/ CC-BY-SA - * @link http://www.iamcal.com/publish/articles/php/parsing_email/ - * - * @param string $email Email string - * @return boolean - */ - public static function isEmail($email) - { - $qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]'; - $dtext = '[^\\x0d\\x5b-\\x5d\\x80-\\xff]'; - $atom = '[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+'; - $quoted_pair = '\\x5c[\\x00-\\x7f]'; - $domain_literal = "\\x5b($dtext|$quoted_pair)*\\x5d"; - $quoted_string = "\\x22($qtext|$quoted_pair)*\\x22"; - $domain_ref = $atom; - $sub_domain = "($domain_ref|$domain_literal)"; - $word = "($atom|$quoted_string)"; - $domain = "$sub_domain(\\x2e$sub_domain)*"; - $local_part = "$word(\\x2e$word)*"; - $addr_spec = "$local_part\\x40$domain"; - - return (boolean)preg_match("!^$addr_spec$!", $email); - } - - /** - * Accents replacement - * - * Replaces some occidental accentuated characters by their ASCII - * representation. 
- * - * @param string $str String to deaccent - * @return string - */ - public static function deaccent($str) - { - $pattern['A'] = '\x{00C0}-\x{00C5}'; - $pattern['AE'] = '\x{00C6}'; - $pattern['C'] = '\x{00C7}'; - $pattern['D'] = '\x{00D0}'; - $pattern['E'] = '\x{00C8}-\x{00CB}'; - $pattern['I'] = '\x{00CC}-\x{00CF}'; - $pattern['N'] = '\x{00D1}'; - $pattern['O'] = '\x{00D2}-\x{00D6}\x{00D8}'; - $pattern['OE'] = '\x{0152}'; - $pattern['S'] = '\x{0160}'; - $pattern['U'] = '\x{00D9}-\x{00DC}'; - $pattern['Y'] = '\x{00DD}'; - $pattern['Z'] = '\x{017D}'; - - $pattern['a'] = '\x{00E0}-\x{00E5}'; - $pattern['ae'] = '\x{00E6}'; - $pattern['c'] = '\x{00E7}'; - $pattern['d'] = '\x{00F0}'; - $pattern['e'] = '\x{00E8}-\x{00EB}'; - $pattern['i'] = '\x{00EC}-\x{00EF}'; - $pattern['n'] = '\x{00F1}'; - $pattern['o'] = '\x{00F2}-\x{00F6}\x{00F8}'; - $pattern['oe'] = '\x{0153}'; - $pattern['s'] = '\x{0161}'; - $pattern['u'] = '\x{00F9}-\x{00FC}'; - $pattern['y'] = '\x{00FD}\x{00FF}'; - $pattern['z'] = '\x{017E}'; - - $pattern['ss'] = '\x{00DF}'; - - foreach ($pattern as $r => $p) { - $str = preg_replace('/[' . $p . 
']/u', $r, $str); - } - - return $str; - } - - /** - * URL cleanup - * - * @param string $str URL to tidy - * @param boolean $keep_slashes Keep slashes in URL - * @param boolean $keep_spaces Keep spaces in URL - * @return string - */ - public static function tidyURL($str, $keep_slashes = true, $keep_spaces = false) - { - $str = strip_tags($str); - $str = str_replace(array('?', '&', '#', '=', '+', '<', '>', '"', '%'), '', $str); - $str = str_replace("'", ' ', $str); - $str = preg_replace('/[\s]+/u', ' ', trim($str)); - - if (!$keep_slashes) { - $str = str_replace('/', '-', $str); - } - - if (!$keep_spaces) { - $str = str_replace(' ', '-', $str); - } - - $str = preg_replace('/[-]+/', '-', $str); - - # Remove path changes in URL - $str = preg_replace('%^/%', '', $str); - $str = preg_replace('%\.+/%', '', $str); - - return $str; - } - - /** - * Cut string - * - * Returns a cuted string on spaced at given length $l. - * - * @param string $str String to cut - * @param integer $l Length to keep - * @return string - */ - public static function cutString($str, $l) - { - $s = preg_split('/([\s]+)/u', $str, -1, PREG_SPLIT_DELIM_CAPTURE); - - $res = ''; - $L = 0; - - if (mb_strlen($s[0]) >= $l) { - return mb_substr($s[0], 0, $l); - } - - foreach ($s as $v) { - $L = $L + mb_strlen($v); - - if ($L > $l) { - break; - } else { - $res .= $v; - } - } - - return trim($res); - } - - /** - * Split words - * - * Returns an array of words from a given string. - * - * @param string $str Words to split - * @return array - */ - public static function splitWords($str, $minChar = 3) - { - $non_word = '\x{0000}-\x{002F}\x{003A}-\x{0040}\x{005b}-\x{0060}\x{007B}-\x{007E}\x{00A0}-\x{00BF}\s'; - if (preg_match_all('/([^' . $non_word . ']{' . $minChar . 
',})/msu', html::clean($str), $match)) { - foreach ($match[1] as $i => $v) { - $match[1][$i] = mb_strtolower($v); - } - return $match[1]; - } - return array(); - } - - /** - * Encoding detection - * - * Returns the encoding (in lowercase) of given $str. - * - * @param string $str String - * @return string - */ - public static function detectEncoding($str) - { - return strtolower(mb_detect_encoding($str . ' ', - 'UTF-8,ISO-8859-1,ISO-8859-2,ISO-8859-3,' . - 'ISO-8859-4,ISO-8859-5,ISO-8859-6,ISO-8859-7,ISO-8859-8,' . - 'ISO-8859-9,ISO-8859-10,ISO-8859-13,ISO-8859-14,ISO-8859-15')); - } - - /** - * Find bad UTF8 tokens - * - * Locates the first bad byte in a UTF-8 string returning it's - * byte index in the string - * PCRE Pattern to locate bad bytes in a UTF-8 string - * Comes from W3 FAQ: Multilingual Forms - * Note: modified to include full ASCII range including control chars - * - * @copyright Harry Fuecks - * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html GNU LGPL 2.1 - * @link http://phputf8.sourceforge.net - * - * @param string $str String to search - * @return integer|false - */ - public static function utf8badFind($str) - { - $UTF8_BAD = - '([\x00-\x7F]' . # ASCII (including control chars) - '|[\xC2-\xDF][\x80-\xBF]' . # non-overlong 2-byte - '|\xE0[\xA0-\xBF][\x80-\xBF]' . # excluding overlongs - '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . # straight 3-byte - '|\xED[\x80-\x9F][\x80-\xBF]' . # excluding surrogates - '|\xF0[\x90-\xBF][\x80-\xBF]{2}' . # planes 1-3 - '|[\xF1-\xF3][\x80-\xBF]{3}' . # planes 4-15 - '|\xF4[\x80-\x8F][\x80-\xBF]{2}' . # plane 16 - '|(.{1}))'; # invalid byte - $pos = 0; - $badList = array(); - - while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) { - $bytes = strlen($matches[0]); - if (isset($matches[2])) { - return $pos; - } - $pos += $bytes; - $str = substr($str, $bytes); - } - return false; - } - - - /** - * BOM removal - * - * Removes BOM from the begining of a string if present. 
- * - * @param string $str String to clean - * @return string - */ - public static function removeBOM($str) - { - if (substr_count($str, '')) { - return str_replace('', '', $str); - } - - return $str; - } - - /** - * Quoted printable conversion - * - * Encodes given str to quoted printable - * - * @param string $str String to encode - * @return string - */ - public static function QPEncode($str) - { - $res = ''; - - foreach (preg_split("/\r?\n/msu", $str) as $line) { - $l = ''; - preg_match_all('/./', $line, $m); - - foreach ($m[0] as $c) { - $a = ord($c); - - if ($a < 32 || $a == 61 || $a > 126) { - $c = sprintf('=%02X', $a); - } - - $l .= $c; - } - - $res .= $l . "\r\n"; - } - return $res; - } + public static function utf8_encode($text, $from = 'ISO-8859-1') + { + return self::toUTF8($text, $from); + } + + public static function toUTF8($str, $encoding = null) + { + if (!$encoding) { + $encoding = self::detectEncoding($str); + } + + $str = iconv($encoding, 'UTF-8//TRANSLIT', $str); + return self::removeOddStuff($str); + } + + public static function removeOddStuff($str) + { + $pattern = array(); + $pattern["'"] = '\x{0092}\x{00b4}\x{0060}\x{2018}\x{2019}'; + $pattern['oe'] = '\x{009c}'; + $pattern['...'] = '\x{0085}'; + $pattern['Oe'] = '\x{008c}'; + $pattern[' '] = '\x{0096}'; + $pattern['«'] = '\x{0093}'; + $pattern['»'] = '\x{0094}'; + + foreach ($pattern as $r => $p) { + $str = preg_replace('|[' . $p . 
']|u', $r, $str); + } + return $str; + } + + public static function getAccentsPattern() + { + $pattern = array(); + $pattern['A'] = '\x{00C0}-\x{00C5}'; + $pattern['AE'] = '\x{00C6}'; + $pattern['C'] = '\x{00C7}'; + $pattern['D'] = '\x{00D0}'; + $pattern['E'] = '\x{00C8}-\x{00CB}'; + $pattern['I'] = '\x{00CC}-\x{00CF}'; + $pattern['N'] = '\x{00D1}'; + $pattern['O'] = '\x{00D2}-\x{00D6}\x{00D8}'; + $pattern['OE'] = '\x{0152}'; + $pattern['S'] = '\x{0160}'; + $pattern['U'] = '\x{00D9}-\x{00DC}'; + $pattern['Y'] = '\x{00DD}'; + $pattern['Z'] = '\x{017D}'; + + $pattern['a'] = '\x{00E0}-\x{00E5}'; + $pattern['ae'] = '\x{00E6}'; + $pattern['c'] = '\x{00E7}'; + $pattern['d'] = '\x{00F0}'; + $pattern['e'] = '\x{00E8}-\x{00EB}'; + $pattern['i'] = '\x{00EC}-\x{00EF}'; + $pattern['n'] = '\x{00F1}'; + $pattern['o'] = '\x{00F2}-\x{00F6}\x{00F8}'; + $pattern['oe'] = '\x{0153}'; + $pattern['s'] = '\x{0161}'; + $pattern['u'] = '\x{00F9}-\x{00FC}'; + $pattern['y'] = '\x{00FD}\x{00FF}'; + $pattern['z'] = '\x{017E}'; + + $pattern['ss'] = '\x{00DF}'; + return $pattern; + } + + public static function removeAccents($str, $clean = true) + { + $pattern = self::getAccentsPattern(); + if ($clean) { + $str = self::cleanUTF8($str); + $del = array('’' => ' ', '”' => ' ', '“' => ' ', '•' => ' ', '…' => ' ', '€' => ' ', + '–' => ' ', '‘' => ' '); + foreach ($del as $d => $p) { + $str = str_replace($d, $p, $str); + } + } + foreach ($pattern as $r => $p) { + $str = preg_replace('/[' . $p . 
']/u', $r, $str); + } + + $from = 'o'; + $to = 'o'; + + $str = strtr($str, $from, $to); + + return $str; + } + + public static function keepOnlyLettersAndDigits($str) + { + return self::condenseWhite(preg_replace('|[^0-9A-Za-z]|ui', ' ', self::removeAccents($str))); + } + + public static function makeAccentInsensiblePattern($str) + { + $patterns = self::getAccentsPattern(); + $chars = preg_split('//ui', $str, -1, PREG_SPLIT_NO_EMPTY); + $pattern = '|'; + foreach ($chars as $char) { + if (isset($patterns[$char])) { + $pattern .= '['; + $pattern .= $char; + $pattern .= $patterns[$char]; + $pattern .= ']{1}'; + } else { + $pattern .= $char; + } + } + $pattern .= '|iu'; + return $pattern; + } + + public static function preg_areplace($search, $replace, $subject) + { + $pattern = self::makeAccentInsensiblePattern($search); + return preg_replace($pattern, $replace, $subject); + } + + public static function multiExplode($separator, $str, $limit = null) + { + $seps = array('§', '£', '¤', '#', '¨', '^', '%'); + foreach ($seps as $sep) { + if (stristr($str, $sep)) { + continue; + } + break; + } + + $str = preg_replace('|[' . preg_quote($separator, '-') . 
']|', $sep, $str); + if (is_null($limit)) { + return explode($sep, $str); + } else { + return explode($sep, $str, $limit); + } + } + + public static function countWords($str) + { + return count(preg_split('|\s|', $str)); + } + + public static function explodeNewLines($str) + { + $str = trim($str); + if ($str === '') { + return []; + } + $str = self::condenseNewLines($str); + return preg_split('|\v|', $str); + } + + public static function substrWord($str, $words, $end = '', $wordsorig = null) + { + if (is_null($wordsorig)) { + $wordsorig = $words; + } + + $maxchars = $wordsorig * 6; + + $o = self::countWords($str); + if ($o <= $words) { + $res = $str; + $addend = false; + } else { + $e = self::multiExplode(" \n", $str, $words); + array_pop($e); + $res = implode(' ', $e); + $addend = true; + } + if (mb_strlen($res) > $maxchars) { + return self::substrWord($str, $words - 1, $end, $wordsorig); + } + + if ($addend) { + $res .= $end; + } + + return $res; + } + + public static function substrWordChars($str, $chars, $end = '') + { + if (strlen($str) <= $chars) { + return $str; + } + + $str = trim(substr($str, 0, $chars)); + $s = preg_split('|\s+|', $str); + array_pop($s); + return implode(' ', $s) . $end; + } + + public static function ucfirst($str, $lower = false) + { + if ($lower) { + $str = mb_strtolower($str); + } + $first = mb_substr($str, 0, 1); + $suite = mb_substr($str, 1); + return mb_strtoupper($first) . 
$suite; + } + + public static function removeNl($str) + { + $trans = array("\n" => ' ', "\r" => ' '); + $str = strtr($str, $trans); + return self::condenseWhite($str); + } + + public static function condenseWhite($str) + { + return preg_replace('|[\s]{2,100}|u', ' ', $str); + } + + public static function condenseNewLines($str) + { + $str = self::normalizeLines($str); + $str = preg_replace('|\n{2,100}|', "\n", $str); + return $str; + } + + public static function html2text($str) + { + $res = self::strip_tags($str); + $res = str_replace(' ', ' ', $res); + + return $res; + } + + public static function strip_tags($str, $allowed_tags = array(), $trim = false) + { + // return preg_replace('|\<.*\>|uU', '', $str); + // http://www.php.net/manual/fr/function.strip-tags.php#86463 + if (!is_array($allowed_tags)) { + $allowed_tags = !empty($allowed_tags) ? array($allowed_tags) : array(); + } + $tags = implode('|', $allowed_tags); + + if (empty($tags)) { + $tags = '[a-z]+'; + } + + preg_match_all('@@i', $str, $matches); + + $full_tags = $matches[0]; + $tag_names = $matches[1]; + + foreach ($full_tags as $i => $full_tag) { + if (!in_array($tag_names[$i], $allowed_tags)) { + if ($trim) { + unset($full_tags[$i]); + } else { + $str = str_replace($full_tag, '', $str); + } + } + } + + return $trim ? 
implode('', $full_tags) : $str;
    }

    /**
     * Builds a URL-friendly slug from a string (or a JSON-encoded object).
     *
     * Colons (and slashes, unless $exclude_slashs is set) become spaces, accents
     * are replaced through deaccent(), every character outside
     * [A-Za-z0-9_ ' : / [ ] -] and whitespace is dropped, and the result is
     * passed to tidyURL() with slashes kept.
     *
     * @param string|object $str Text to convert; objects are json_encode()d first
     * @param string $replace Currently unused, kept for interface compatibility
     * @param boolean $exclude_slashs When true, slashes are not turned into spaces
     * @param boolean $exclude_dots Currently unused, kept for interface compatibility
     * @return string
     */
    public static function str2URL($str, $replace = '-', $exclude_slashs = false, $exclude_dots = false)
    {
        if (is_object($str)) {
            $str = json_encode($str);
        }

        // NOTE(review): the extracted source showed '&' => '&' (a no-op), most
        // likely an HTML-decoded '&amp;'. Restored here as an assumption - confirm.
        $str = str_replace('&amp;', '&', $str);
        $str = str_replace(':', ' ', $str);
        if (!$exclude_slashs) {
            $str = str_replace('/', ' ', $str);
        }

        $str = self::deaccent($str);
        $str = preg_replace('/[^A-Za-z0-9_\s\'\:\/[\]-]/', '', $str);

        return self::tidyURL($str, true);
    }

    /**
     * Replaces every byte reported by utf8badFind() with $replace.
     *
     * @param string $str String to clean
     * @param string $replace Replacement for each invalid byte
     * @return string
     */
    public static function cleanUTF8($str, $replace = '?')
    {
        while (($bad_index = self::utf8badFind($str)) !== false) {
            $str = substr_replace($str, $replace, $bad_index, 1);
        }

        // NOTE(review): both search strings appear empty in the extracted source,
        // which makes these calls no-ops; they presumably held characters that
        // were lost in extraction - confirm against the repository.
        $str = str_replace('', $replace, $str);
        $str = str_replace('', $replace, $str);

        return $str;
    }

    /**
     * Decodes a single HTML entity to its UTF-8 character.
     *
     * @param string $code Entity name or numeric reference, with or without the surrounding '&' and ';'
     * @return string
     */
    public static function getChar($code)
    {
        $entity = '&' . trim($code, '&;') . ';';

        return html_entity_decode($entity, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Generates pseudo-random filler text made of lowercase letters and spaces.
     *
     * @param integer $length Number of characters to generate
     * @return string
     */
    public static function randText($length = 300)
    {
        $alphabet = 'aeiouy azertyuiopqsdfghjklmwxcvbn eaiouaeiou ';
        $lastIndex = strlen($alphabet) - 1;

        $res = '';
        // Strictly less than: the previous "<=" produced $length + 1 characters.
        for ($i = 0; $i < $length; $i++) {
            $res .= $alphabet[rand(0, $lastIndex)];
        }

        return $res;
    }

    /**
     * Splits a string into words of at least three characters, keeping the case.
     *
     * The input goes through html::clean() before matching.
     *
     * @param string $str Text to split
     * @return array
     */
    public static function splitWordsWithCase($str)
    {
        $non_word = '\x{0000}-\x{002F}\x{003A}-\x{0040}\x{005b}-\x{0060}\x{007B}-\x{007E}\x{00A0}-\x{00BF}\s';

        if (preg_match_all('/([^' . $non_word . ']{3,})/msu', html::clean($str), $match)) {
            return $match[1];
        }

        return [];
    }

    /**
     * Finds the words of $str that match a list, ignoring case and accents.
     *
     * @param string $str Text to search in
     * @param array|string $list Words to look for; a string is split with splitWords()
     * @return array|false Matching words as written in $str, or false when none match
     */
    public static function find_words_from_list($str, $list)
    {
        $words = array_unique(self::splitWordsWithCase($str));
        $wanted = is_array($list) ? $list : array_unique(self::splitWords($list));

        $spellings = [];  // normalized word => original spellings found in $str
        $normalized = [];
        foreach ($words as $word) {
            $key = strtolower(self::removeAccents($word));
            $spellings[$key][] = $word;
            $normalized[] = $key;
        }

        $found = array_intersect($wanted, $normalized);
        if (!$found) {
            return false;
        }

        $res = [];
        foreach ($found as $key) {
            $res = array_merge($res, $spellings[$key]);
        }

        return $res;
    }

    /**
     * Splits a string into an array of characters using the mb_* functions.
     *
     * @param string $string String to split
     * @return array
     */
    public static function mb_str_split($string)
    {
        $count = mb_strlen($string);
        $chars = [];

        for ($offset = 0; $offset < $count; $offset++) {
            $chars[] = mb_substr($string, $offset, 1);
        }

        return $chars;
    }

    /**
     * Alias of mb_str_split().
     *
     * @param string $str String to split
     * @return array
     */
    public static function strToArray($str)
    {
        return self::mb_str_split($str);
    }

    /**
     * Decodes a UTF-8 string into an array of Unicode code points.
     *
     * Overlong forms, surrogates, 5/6-octet sequences and code points above
     * 0x10FFFF make the function return false. The BOM (0xFEFF) is skipped.
     * A multi-octet sequence truncated by the end of the string is dropped
     * without an error.
     *
     * @param string $str UTF-8 encoded string
     * @return array|false List of integer code points, or false on invalid input
     */
    public static function utf8ToUnicode($str)
    {
        $mState = 0; // continuation octets still expected for the current character
        $mUcs4 = 0;  // code point being assembled
        $mBytes = 1; // total number of octets in the current sequence

        $out = [];

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square brackets: the "{}" string offset syntax is removed in PHP 8.
            $in = ord($str[$i]);

            if (0 == $mState) {
                // Expecting either a US-ASCII octet or the lead octet of a sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & $in)) {
                    // Lead octet of a 2-octet sequence.
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & $in)) {
                    // Lead octet of a 3-octet sequence.
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & $in)) {
                    // Lead octet of a 4-octet sequence.
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & $in)) {
                    // Lead octet of a 5-octet sequence: always illegal, but the
                    // sequence is consumed so the check below rejects it.
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & $in)) {
                    // Lead octet of a 6-octet sequence, handled like the 5-octet case.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Neither US-ASCII nor a legal lead octet.
                    return false;
                }
                continue;
            }

            // Inside a sequence: only continuation octets (10xxxxxx) are legal.
            if (0x80 != (0xC0 & $in)) {
                return false;
            }

            $shift = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x0000003F) << $shift;

            if (0 == --$mState) {
                // Sequence complete: reject illegal encodings and code points.
                if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                    (4 < $mBytes) ||
                    // Surrogates are illegal
                    (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                    // Code points outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }
                if (0xFEFF != $mUcs4) {
                    // BOM is legal but is not output
                    $out[] = $mUcs4;
                }
                // Reset the decoder for the next character.
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
            }
        }

        return $out;
    }

    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * Returns false if the input array contains ints that represent
     * surrogates or are outside the Unicode range.
*/
    public static function unicodeToUtf8($arr)
    {
        $dest = '';
        foreach ($arr as $codePoint) {
            if ($codePoint < 0) {
                return false;
            }

            if ($codePoint <= 0x007f) {
                // 1 octet
                $dest .= chr($codePoint);
            } elseif ($codePoint <= 0x07ff) {
                // 2 octets
                $dest .= chr(0xc0 | ($codePoint >> 6));
                $dest .= chr(0x80 | ($codePoint & 0x003f));
            } elseif ($codePoint == 0xFEFF) {
                // BOM: emit nothing
                continue;
            } elseif ($codePoint >= 0xD800 && $codePoint <= 0xDFFF) {
                // surrogates are not encodable
                return false;
            } elseif ($codePoint <= 0xffff) {
                // 3 octets
                $dest .= chr(0xe0 | ($codePoint >> 12));
                $dest .= chr(0x80 | (($codePoint >> 6) & 0x003f));
                $dest .= chr(0x80 | ($codePoint & 0x003f));
            } elseif ($codePoint <= 0x10ffff) {
                // 4 octets
                $dest .= chr(0xf0 | ($codePoint >> 18));
                $dest .= chr(0x80 | (($codePoint >> 12) & 0x3f));
                $dest .= chr(0x80 | (($codePoint >> 6) & 0x3f));
                $dest .= chr(0x80 | ($codePoint & 0x3f));
            } else {
                // out of the Unicode range
                return false;
            }
        }

        return $dest;
    }

    /**
     * Returns the UTF-8 encoding of a single Unicode code point.
     *
     * @param integer $n Code point
     * @return string|false False when the code point cannot be encoded
     */
    public static function uchr($n)
    {
        return self::unicodeToUtf8([$n]);
    }

    /**
     * Returns the code point of the first character of a UTF-8 string.
     *
     * @param string $c UTF-8 string
     * @return integer|null Null when the string is empty or is rejected by utf8ToUnicode()
     */
    public static function uord($c)
    {
        $codePoints = self::utf8ToUnicode($c);
        // utf8ToUnicode() returns false on invalid input; array_shift(false)
        // raises a warning (PHP 7) or a TypeError (PHP 8).
        if (!is_array($codePoints)) {
            return null;
        }

        return array_shift($codePoints);
    }

    /**
     * Binary string comparison with optional trimming, accent and case folding.
     *
     * @param string $s1 First string
     * @param string $s2 Second string
     * @param boolean $ignoreCase Lowercase both strings (mb_strtolower) before comparing
     * @param boolean $ignoreAccents Run both strings through removeAccents() before comparing
     * @param boolean|string $trim False: no trimming; true: trim() with its default
     *                             character list; string: characters to trim
     * @return integer Same contract as the native strcmp()
     */
    public static function strcmp($s1, $s2, $ignoreCase = false, $ignoreAccents = false, $trim = false)
    {
        if ($trim === true) {
            // Passing true straight to trim() would be read as the character list "1".
            $s1 = trim($s1);
            $s2 = trim($s2);
        } elseif ($trim !== false) {
            $s1 = trim($s1, $trim);
            $s2 = trim($s2, $trim);
        }

        if ($ignoreAccents) {
            $s1 = self::removeAccents($s1);
            $s2 = self::removeAccents($s2);
        }

        if ($ignoreCase) {
            $s1 = mb_strtolower($s1);
            $s2 = mb_strtolower($s2);
        }

        return strcmp($s1, $s2);
    }

    /**
     * Collapses every run of whitespace (new lines included) into one space.
     *
     * @param string $input
     * @return string
     */
    public static function removeNewLines($input)
    {
        return preg_replace("|\s+|", ' ', $input);
    }

    /**
     * Splits a string on any line ending (\r\n, \r or \n).
     *
     * @param string $str
     * @param boolean $compact When true, lines are trimmed and empty ones are dropped
     * @return array
     */
    public static function splitLines($str, $compact = true)
    {
        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $str));

        if (!$compact) {
            return $lines;
        }

        $res = [];
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== '') {
                $res[] = $line;
            }
        }

        return $res;
    }

+ public static function parseUrl($url, $forceScheme = true) + { + $url = trim($url); + if (substr($url, 0, 2) == '//') { + $url = 'http:' . $url; + } + $res = parse_url($url); + if ($forceScheme && !isset($res['scheme'])) { + $url = 'http://' . $url; + $res = parse_url($url); + } + + if (isset($res['query'])) { + parse_str($res['query'], $tmp); + $res['query_params'] = $tmp; + } + + if (isset($res['path'])) { + $components = explode('/', trim($res['path'], '/')); + $filteredComponents = array(); + foreach ($components as $c) { + if ($c == '') { + continue; + } + $filteredComponents[] = $c; + } + $res['path_components'] = $filteredComponents; + } + return $res; + } + + public static function pluriel($nb, $singulier, $pluriel, $zero = false, $displayNb = true) + { + $nb = intval($nb); + $res = ''; + if ($displayNb) { + $res .= $nb . ' '; + } + if ($nb == 0 && $zero) { + return $zero; + } + if ($nb <= 1) { + $res .= $singulier; + } else { + $res .= $pluriel; + } + return $res; + } + + public static function normalizeLines($text, $os = 'nix') + { + $text = str_replace("\r\n", "\n", $text); + $text = str_replace("\r", "\n", $text); + if ($os == 'win') { + return str_replace("\n", "\r\n", $text); + } + return $text; + } + + public static function underscoreToCamelCase($str, $upperFirst = false) + { + return Str::camel($str); + } + + public static function camelCaseToUnderscore($str) + { + return Str::snake($str); + } + + // Stops orphans in HTML by replacing the last space with a   + public static function preventOrphans($str) + { + + $find = ' '; // What to search for + $replace = ' '; // What to replace it with + + $last_space = strrpos($str, $find); // Find last occurrence in string + + if ($last_space !== false) { + $str = substr_replace($str, $replace, $last_space, strlen($find)); + } + + // Also replace punctuation that has spaces before it (eg. 
in French) + $punctuations = array(' :', ' !', ' ?', '« ', ' »'); + $replacements = array("{$replace}:", "{$replace}!", "{$replace}?", "«{$replace}", "{$replace}»"); + $str = str_replace($punctuations, $replacements, $str); + + return $str; + } + + /** + * Check email address + * + * Returns true if $email is a valid email address. + * + * @param string $email Email string + * @return boolean + * @link http://www.iamcal.com/publish/articles/php/parsing_email/ + * + * @copyright Cal Henderson + * @license http://creativecommons.org/licenses/by-sa/2.5/ CC-BY-SA + */ + public static function isEmail($email) + { + $qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]'; + $dtext = '[^\\x0d\\x5b-\\x5d\\x80-\\xff]'; + $atom = '[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+'; + $quoted_pair = '\\x5c[\\x00-\\x7f]'; + $domain_literal = "\\x5b($dtext|$quoted_pair)*\\x5d"; + $quoted_string = "\\x22($qtext|$quoted_pair)*\\x22"; + $domain_ref = $atom; + $sub_domain = "($domain_ref|$domain_literal)"; + $word = "($atom|$quoted_string)"; + $domain = "$sub_domain(\\x2e$sub_domain)*"; + $local_part = "$word(\\x2e$word)*"; + $addr_spec = "$local_part\\x40$domain"; + + return (boolean)preg_match("!^$addr_spec$!", $email); + } + + /** + * Accents replacement + * + * Replaces some occidental accentuated characters by their ASCII + * representation. 
*
     * @param string $str String to deaccent
     * @return string
     */
    public static function deaccent($str)
    {
        // ASCII replacement => character-class content (code point ranges)
        $pattern = [
            'A' => '\x{00C0}-\x{00C5}',
            'AE' => '\x{00C6}',
            'C' => '\x{00C7}',
            'D' => '\x{00D0}',
            'E' => '\x{00C8}-\x{00CB}',
            'I' => '\x{00CC}-\x{00CF}',
            'N' => '\x{00D1}',
            'O' => '\x{00D2}-\x{00D6}\x{00D8}',
            'OE' => '\x{0152}',
            'S' => '\x{0160}',
            'U' => '\x{00D9}-\x{00DC}',
            'Y' => '\x{00DD}',
            'Z' => '\x{017D}',

            'a' => '\x{00E0}-\x{00E5}',
            'ae' => '\x{00E6}',
            'c' => '\x{00E7}',
            'd' => '\x{00F0}',
            'e' => '\x{00E8}-\x{00EB}',
            'i' => '\x{00EC}-\x{00EF}',
            'n' => '\x{00F1}',
            'o' => '\x{00F2}-\x{00F6}\x{00F8}',
            'oe' => '\x{0153}',
            's' => '\x{0161}',
            'u' => '\x{00F9}-\x{00FC}',
            'y' => '\x{00FD}\x{00FF}',
            'z' => '\x{017E}',

            'ss' => '\x{00DF}',
        ];

        foreach ($pattern as $ascii => $range) {
            $str = preg_replace('/[' . $range . ']/u', $ascii, $str);
        }

        return $str;
    }

    /**
     * URL cleanup
     *
     * Strips tags and the characters ? & # = + < > " %, turns apostrophes into
     * spaces, collapses whitespace and dashes, and removes a leading slash as
     * well as "./"-style path changes.
     *
     * @param string $str URL to tidy
     * @param boolean $keep_slashes Keep slashes in URL (otherwise replaced by dashes)
     * @param boolean $keep_spaces Keep spaces in URL (otherwise replaced by dashes)
     * @return string
     */
    public static function tidyURL($str, $keep_slashes = true, $keep_spaces = false)
    {
        $url = strip_tags($str);
        $url = str_replace(['?', '&', '#', '=', '+', '<', '>', '"', '%'], '', $url);
        $url = str_replace("'", ' ', $url);
        $url = preg_replace('/[\s]+/u', ' ', trim($url));

        if (!$keep_slashes) {
            $url = str_replace('/', '-', $url);
        }

        if (!$keep_spaces) {
            $url = str_replace(' ', '-', $url);
        }

        $url = preg_replace('/[-]+/', '-', $url);

        # Remove path changes in URL
        $url = preg_replace('%^/%', '', $url);

        return preg_replace('%\.+/%', '', $url);
    }

    /**
     * Cut string
     *
     * Returns the string cut on a whitespace boundary so that it does not
     * exceed $l characters. When the first word alone is at least $l characters
     * long, it is truncated to $l characters.
     *
     * @param string $str String to cut
     * @param integer $l Length to keep
     * @return string
     */
    public static function cutString($str, $l)
    {
        // Words and the whitespace runs between them, in order.
        $tokens = preg_split('/([\s]+)/u', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

        if (mb_strlen($tokens[0]) >= $l) {
            return mb_substr($tokens[0], 0, $l);
        }

        $res = '';
        $total = 0;
        foreach ($tokens as $token) {
            $total += mb_strlen($token);
            if ($total > $l) {
                break;
            }
            $res .= $token;
        }

        return trim($res);
    }

    /**
     * Split words
     *
     * Returns an array of lowercased words from a given string. The input goes
     * through html::clean() before matching.
     *
     * @param string $str Words to split
     * @param integer $minChar Minimum length of a word
     * @return array
     */
    public static function splitWords($str, $minChar = 3)
    {
        $non_word = '\x{0000}-\x{002F}\x{003A}-\x{0040}\x{005b}-\x{0060}\x{007B}-\x{007E}\x{00A0}-\x{00BF}\s';
        // Cast so that only a number can end up inside the quantifier.
        $quantifier = '{' . (int)$minChar . ',}';

        if (preg_match_all('/([^' . $non_word . ']' . $quantifier . ')/msu', html::clean($str), $match)) {
            return array_map('mb_strtolower', $match[1]);
        }

        return [];
    }

    /**
     * Encoding detection
     *
     * Returns the encoding (in lowercase) of given $str.
     *
     * @param string $str String
     * @return string
     */
    public static function detectEncoding($str)
    {
        $candidates = 'UTF-8,ISO-8859-1,ISO-8859-2,ISO-8859-3,' .
            'ISO-8859-4,ISO-8859-5,ISO-8859-6,ISO-8859-7,ISO-8859-8,' .
            'ISO-8859-9,ISO-8859-10,ISO-8859-13,ISO-8859-14,ISO-8859-15';

        return strtolower(mb_detect_encoding($str . ' ', $candidates));
    }

    /**
     * Find bad UTF8 tokens
     *
     * Locates the first bad byte in a UTF-8 string returning its
     * byte index in the string
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * @param string $str String to search
     * @return integer|false
     * @link http://phputf8.sourceforge.net
     *
     * @copyright Harry Fuecks
     * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html GNU LGPL 2.1
     */
    public static function utf8badFind($str)
    {
        $UTF8_BAD =
            '([\x00-\x7F]' . # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' . # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' . # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' . # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' . # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' . # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' . # plane 16
            '|(.{1}))'; # invalid byte

        // \G anchors each match at $pos, so the string is walked in place
        // instead of being re-copied with substr() after every character.
        $pos = 0;
        while (preg_match('/\G' . $UTF8_BAD . '/S', $str, $matches, 0, $pos)) {
            if (isset($matches[2])) {
                // Only the "invalid byte" alternative fills group 2.
                return $pos;
            }
            $pos += strlen($matches[0]);
        }

        return false;
    }


    /**
     * BOM removal
     *
     * Removes BOM from the beginning of a string if present.
*
     * Note: every occurrence of the UTF-8 BOM byte sequence is removed, not
     * only a leading one.
     *
     * @param string $str String to clean
     * @return string
     */
    public static function removeBOM($str)
    {
        // The BOM is spelled out as bytes: an invisible literal is easily lost,
        // and an empty needle makes substr_count() fail.
        $bom = "\xEF\xBB\xBF";

        if (substr_count($str, $bom)) {
            return str_replace($bom, '', $str);
        }

        return $str;
    }

    /**
     * Quoted printable conversion
     *
     * Encodes given str to quoted printable: bytes below 32, above 126 and the
     * "=" sign become "=XX". Every input line is terminated with CRLF in the
     * output, the last one included.
     *
     * @param string $str String to encode
     * @return string
     */
    public static function QPEncode($str)
    {
        $res = '';

        foreach (preg_split("/\r?\n/msu", $str) as $line) {
            // No "u" modifier here: the line is walked byte by byte.
            preg_match_all('/./', $line, $m);

            $encoded = '';
            foreach ($m[0] as $byte) {
                $code = ord($byte);
                $needsEscape = $code < 32 || $code == 61 || $code > 126;
                $encoded .= $needsEscape ? sprintf('=%02X', $code) : $byte;
            }

            $res .= $encoded . "\r\n";
        }

        return $res;
    }
}
\ No newline at end of file
-- 
2.39.5