. * * ---------------------------------------------------------------------- * * Class Name: Detect Arabic String Character Set * * Filename: CharsetD.php * * Original Author(s): Khaled Al-Sham'aa * * Purpose: This class returns the character set used by a given Arabic * string passed into it. The character sets it can detect are the * three most popular: Windows-1256, ISO 8859-6, and UTF-8. * * ---------------------------------------------------------------------- * * Detect Arabic String Character Set * * The last step of the Information Retrieval process is to display the found * documents to the user. However, some difficulties might occur at that point. * English texts are usually written in the ASCII standard. Unlike English, * many languages have several different character sets and no single * standard. This plurality of standards causes problems, especially in a web * environment. * * This PHP class returns the character set used by a given Arabic string * passed into it. The character sets it can detect are the three most * popular: Windows-1256, ISO 8859-6, and UTF-8. 
/**
 * Usage example for the Arabic character set detector:
 *
 *     include('./I18N/Arabic.php');
 *     $obj = new I18N_Arabic('CharsetD');
 *
 *     $charset = $obj->getCharset($text);
 *
 * @category  I18N
 * @package   I18N_Arabic
 * @author    Khaled Al-Sham'aa
 * @copyright 2006-2016 Khaled Al-Sham'aa
 *
 * @license   LGPL
 * @link      http://www.ar-php.org
 */

/**
 * This PHP class detects the character set of an Arabic string.
 *
 * Detection is a byte-frequency heuristic: it counts how often the byte
 * sequences of the letters Alef, Lam and Yeh occur under each candidate
 * encoding (Windows-1256, ISO-8859-6 and UTF-8) and compares the counts.
 *
 * @category  I18N
 * @package   I18N_Arabic
 * @author    Khaled Al-Sham'aa
 * @copyright 2006-2016 Khaled Al-Sham'aa
 *
 * @license   LGPL
 * @link      http://www.ar-php.org
 */
class I18N_Arabic_CharsetD
{
    /**
     * Byte sequences of Alef, Lam and Yeh in each supported character set.
     *
     * The key order is significant: it is the order of the keys returned by
     * guess() and the tie-break order used by getCharset().
     * Note that Alef is the byte 0xC7 in both single-byte encodings, so it
     * contributes to both of their scores.
     *
     * @var array
     */
    private static $_letterSignatures = array(
        'windows-1256' => array("\xC7", "\xE1", "\xED"),
        'iso-8859-6'   => array("\xC7", "\xE4", "\xEA"),
        'utf-8'        => array("\xD8\xA7", "\xD9\x84", "\xD9\x8A"),
    );

    /**
     * Loads initialize values
     *
     * @ignore
     */
    public function __construct()
    {
    }

    /**
     * Count number of hits for the most frequent letters in Arabic language
     * (Alef, Lam and Yeh), then calculate the association ratio with each of
     * the possible character sets (Windows-1256, ISO-8859-6 and UTF-8).
     *
     * @param String $string Arabic string in unknown format
     *
     * @return Array Character set as key and string association ratio
     *               (rounded percentage) as value
     * @author Khaled Al-Sham'aa
     */
    public function guess($string)
    {
        $string = (string) $string;

        $hits = array();
        foreach (self::$_letterSignatures as $name => $sequences) {
            $hits[$name] = 0;
            foreach ($sequences as $sequence) {
                $hits[$name] += substr_count($string, $sequence);
            }
        }

        // The extra 1 keeps the divisor non-zero when no letter was found.
        $total = array_sum($hits) + 1;

        $charset = array();
        foreach ($hits as $name => $count) {
            $charset[$name] = round($count * 100 / $total);
        }

        return $charset;
    }

    /**
     * Find the most possible character set for given Arabic string in unknown
     * format.
     *
     * When the string is an HTML document that declares its character set in
     * a meta tag (either the http-equiv Content-Type form or the HTML5
     * charset attribute form), the declared name is returned in lower case.
     * Otherwise the character set with the highest ratio from guess() is
     * returned; on a tie the first one in the order windows-1256,
     * iso-8859-6, utf-8 wins.
     *
     * @param String $string Arabic string in unknown format
     *
     * @return String The most possible character set for given Arabic string
     *                in unknown format [utf-8|windows-1256|iso-8859-6], or
     *                the lower-cased name declared in a meta tag
     * @author Khaled Al-Sham'aa
     */
    public function getCharset($string)
    {
        $string = (string) $string;

        // An empty pattern would match every input and capture nothing, so
        // the declaration has to be matched explicitly with a capture group.
        $metaPattern = '/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*'
            . '([a-z0-9][a-z0-9._:\-]*)/i';

        if (preg_match($metaPattern, $string, $matches) === 1) {
            return strtolower($matches[1]);
        }

        // Explicit first-wins maximum scan: the result for equal ratios does
        // not depend on the stability of the sort implementation.
        $value     = null;
        $bestRatio = -1;
        foreach ($this->guess($string) as $name => $ratio) {
            if ($ratio > $bestRatio) {
                $bestRatio = $ratio;
                $value     = $name;
            }
        }

        return $value;
    }
}