File joomla/language/stemmer/porteren.php | Joomla! Framework TM

  1 <?php
  2 /**
  3  * @package     Joomla.Platform
  4  * @subpackage  Language
  5  *
  6  * @copyright   Copyright (C) 2005 - 2017 Open Source Matters, Inc. All rights reserved.
  7  * @copyright   Copyright (C) 2005 Richard Heyes (http://www.phpguru.org/). All rights reserved.
  8  * @license     GNU General Public License version 2 or later; see LICENSE
  9  */
 10 
// Stop execution immediately unless the JPATH_PLATFORM constant has been defined.
defined('JPATH_PLATFORM') or die;
 12 
 13 /**
 14  * Porter English stemmer class.
 15  *
 16  * This class was adapted from one written by Richard Heyes.
 17  * See copyright and link information above.
 18  *
 19  * @since  12.1
 20  */
 21 class JLanguageStemmerPorteren extends JLanguageStemmer
 22 {
 23     /**
 24      * Regex for matching a consonant.
 25      *
 26      * @var    string
 27      * @since  12.1
 28      */
 29     private static $_regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
 30 
 31     /**
 32      * Regex for matching a vowel
 33      * @var    string
 34      * @since  12.1
 35      */
 36     private static $_regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
 37 
 38     /**
 39      * Method to stem a token and return the root.
 40      *
 41      * @param   string  $token  The token to stem.
 42      * @param   string  $lang   The language of the token.
 43      *
 44      * @return  string  The root token.
 45      *
 46      * @since   12.1
 47      */
 48     public function stem($token, $lang)
 49     {
 50         // Check if the token is long enough to merit stemming.
 51         if (strlen($token) <= 2)
 52         {
 53             return $token;
 54         }
 55 
 56         // Check if the language is English or All.
 57         if ($lang !== 'en')
 58         {
 59             return $token;
 60         }
 61 
 62         // Stem the token if it is not in the cache.
 63         if (!isset($this->cache[$lang][$token]))
 64         {
 65             // Stem the token.
 66             $result = $token;
 67             $result = self::_step1ab($result);
 68             $result = self::_step1c($result);
 69             $result = self::_step2($result);
 70             $result = self::_step3($result);
 71             $result = self::_step4($result);
 72             $result = self::_step5($result);
 73 
 74             // Add the token to the cache.
 75             $this->cache[$lang][$token] = $result;
 76         }
 77 
 78         return $this->cache[$lang][$token];
 79     }
 80 
 81     /**
 82      * Step 1
 83      *
 84      * @param   string  $word  The token to stem.
 85      *
 86      * @return  string
 87      *
 88      * @since   12.1
 89      */
 90     private static function _step1ab($word)
 91     {
 92         // Part a
 93         if (substr($word, -1) == 's')
 94         {
 95                 self::_replace($word, 'sses', 'ss')
 96             or self::_replace($word, 'ies', 'i')
 97             or self::_replace($word, 'ss', 'ss')
 98             or self::_replace($word, 's', '');
 99         }
100 
101         // Part b
102         if (substr($word, -2, 1) != 'e' or !self::_replace($word, 'eed', 'ee', 0))
103         {
104             // First rule
105             $v = self::$_regex_vowel;
106 
107             // Check ing and ed
108             // Note use of && and OR, for precedence reasons
109             if (preg_match("#$v+#", substr($word, 0, -3)) && self::_replace($word, 'ing', '')
110                 or preg_match("#$v+#", substr($word, 0, -2)) && self::_replace($word, 'ed', ''))
111             {
112                 // If one of above two test successful
113                 if (!self::_replace($word, 'at', 'ate') and !self::_replace($word, 'bl', 'ble') and !self::_replace($word, 'iz', 'ize'))
114                 {
115                     // Double consonant ending
116                     if (self::_doubleConsonant($word) and substr($word, -2) != 'll' and substr($word, -2) != 'ss' and substr($word, -2) != 'zz')
117                     {
118                         $word = substr($word, 0, -1);
119                     }
120                     elseif (self::_m($word) == 1 and self::_cvc($word))
121                     {
122                         $word .= 'e';
123                     }
124                 }
125             }
126         }
127 
128         return $word;
129     }
130 
131     /**
132      * Step 1c
133      *
134      * @param   string  $word  The token to stem.
135      *
136      * @return  string
137      *
138      * @since   12.1
139      */
140     private static function _step1c($word)
141     {
142         $v = self::$_regex_vowel;
143 
144         if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1)))
145         {
146             self::_replace($word, 'y', 'i');
147         }
148 
149         return $word;
150     }
151 
152     /**
153      * Step 2
154      *
155      * @param   string  $word  The token to stem.
156      *
157      * @return  string
158      *
159      * @since   12.1
160      */
161     private static function _step2($word)
162     {
163         switch (substr($word, -2, 1))
164         {
165             case 'a':
166                     self::_replace($word, 'ational', 'ate', 0)
167                 or self::_replace($word, 'tional', 'tion', 0);
168                 break;
169             case 'c':
170                     self::_replace($word, 'enci', 'ence', 0)
171                 or self::_replace($word, 'anci', 'ance', 0);
172                 break;
173             case 'e':
174                 self::_replace($word, 'izer', 'ize', 0);
175                 break;
176             case 'g':
177                 self::_replace($word, 'logi', 'log', 0);
178                 break;
179             case 'l':
180                     self::_replace($word, 'entli', 'ent', 0)
181                 or self::_replace($word, 'ousli', 'ous', 0)
182                 or self::_replace($word, 'alli', 'al', 0)
183                 or self::_replace($word, 'bli', 'ble', 0)
184                 or self::_replace($word, 'eli', 'e', 0);
185                 break;
186             case 'o':
187                     self::_replace($word, 'ization', 'ize', 0)
188                 or self::_replace($word, 'ation', 'ate', 0)
189                 or self::_replace($word, 'ator', 'ate', 0);
190                 break;
191             case 's':
192                     self::_replace($word, 'iveness', 'ive', 0)
193                 or self::_replace($word, 'fulness', 'ful', 0)
194                 or self::_replace($word, 'ousness', 'ous', 0)
195                 or self::_replace($word, 'alism', 'al', 0);
196                 break;
197             case 't':
198                     self::_replace($word, 'biliti', 'ble', 0)
199                 or self::_replace($word, 'aliti', 'al', 0)
200                 or self::_replace($word, 'iviti', 'ive', 0);
201                 break;
202         }
203 
204         return $word;
205     }
206 
207     /**
208      * Step 3
209      *
210      * @param   string  $word  The token to stem.
211      *
212      * @return  string
213      *
214      * @since   12.1
215      */
216     private static function _step3($word)
217     {
218         switch (substr($word, -2, 1))
219         {
220             case 'a':
221                 self::_replace($word, 'ical', 'ic', 0);
222                 break;
223             case 's':
224                 self::_replace($word, 'ness', '', 0);
225                 break;
226             case 't':
227                     self::_replace($word, 'icate', 'ic', 0)
228                 or self::_replace($word, 'iciti', 'ic', 0);
229                 break;
230             case 'u':
231                 self::_replace($word, 'ful', '', 0);
232                 break;
233             case 'v':
234                 self::_replace($word, 'ative', '', 0);
235                 break;
236             case 'z':
237                 self::_replace($word, 'alize', 'al', 0);
238                 break;
239         }
240 
241         return $word;
242     }
243 
244     /**
245      * Step 4
246      *
247      * @param   string  $word  The token to stem.
248      *
249      * @return  string
250      *
251      * @since   12.1
252      */
253     private static function _step4($word)
254     {
255         switch (substr($word, -2, 1))
256         {
257             case 'a':
258                 self::_replace($word, 'al', '', 1);
259                 break;
260             case 'c':
261                     self::_replace($word, 'ance', '', 1)
262                 or self::_replace($word, 'ence', '', 1);
263                 break;
264             case 'e':
265                 self::_replace($word, 'er', '', 1);
266                 break;
267             case 'i':
268                 self::_replace($word, 'ic', '', 1);
269                 break;
270             case 'l':
271                     self::_replace($word, 'able', '', 1)
272                 or self::_replace($word, 'ible', '', 1);
273                 break;
274             case 'n':
275                     self::_replace($word, 'ant', '', 1)
276                 or self::_replace($word, 'ement', '', 1)
277                 or self::_replace($word, 'ment', '', 1)
278                 or self::_replace($word, 'ent', '', 1);
279                 break;
280             case 'o':
281                 if (substr($word, -4) == 'tion' or substr($word, -4) == 'sion')
282                 {
283                     self::_replace($word, 'ion', '', 1);
284                 }
285                 else
286                 {
287                     self::_replace($word, 'ou', '', 1);
288                 }
289                 break;
290             case 's':
291                 self::_replace($word, 'ism', '', 1);
292                 break;
293             case 't':
294                     self::_replace($word, 'ate', '', 1)
295                 or self::_replace($word, 'iti', '', 1);
296                 break;
297             case 'u':
298                 self::_replace($word, 'ous', '', 1);
299                 break;
300             case 'v':
301                 self::_replace($word, 'ive', '', 1);
302                 break;
303             case 'z':
304                 self::_replace($word, 'ize', '', 1);
305                 break;
306         }
307 
308         return $word;
309     }
310 
311     /**
312      * Step 5
313      *
314      * @param   string  $word  The token to stem.
315      *
316      * @return  string
317      *
318      * @since   12.1
319      */
320     private static function _step5($word)
321     {
322         // Part a
323         if (substr($word, -1) == 'e')
324         {
325             if (self::_m(substr($word, 0, -1)) > 1)
326             {
327                 self::_replace($word, 'e', '');
328             }
329             elseif (self::_m(substr($word, 0, -1)) == 1)
330             {
331                 if (!self::_cvc(substr($word, 0, -1)))
332                 {
333                     self::_replace($word, 'e', '');
334                 }
335             }
336         }
337 
338         // Part b
339         if (self::_m($word) > 1 and self::_doubleConsonant($word) and substr($word, -1) == 'l')
340         {
341             $word = substr($word, 0, -1);
342         }
343 
344         return $word;
345     }
346 
347     /**
348      * Replaces the first string with the second, at the end of the string. If third
349      * arg is given, then the preceding string must match that m count at least.
350      *
351      * @param   string   &$str   String to check
352      * @param   string   $check  Ending to check for
353      * @param   string   $repl   Replacement string
354      * @param   integer  $m      Optional minimum number of m() to meet
355      *
356      * @return  boolean  Whether the $check string was at the end
357      *                   of the $str string. True does not necessarily mean
358      *                   that it was replaced.
359      *
360      * @since   12.1
361      */
362     private static function _replace(&$str, $check, $repl, $m = null)
363     {
364         $len = 0 - strlen($check);
365 
366         if (substr($str, $len) == $check)
367         {
368             $substr = substr($str, 0, $len);
369 
370             if (is_null($m) or self::_m($substr) > $m)
371             {
372                 $str = $substr . $repl;
373             }
374 
375             return true;
376         }
377 
378         return false;
379     }
380 
381     /**
382      * m() measures the number of consonant sequences in $str. if c is
383      * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
384      * presence,
385      *
386      * <c><v>       gives 0
387      * <c>vc<v>     gives 1
388      * <c>vcvc<v>   gives 2
389      * <c>vcvcvc<v> gives 3
390      *
391      * @param   string  $str  The string to return the m count for
392      *
393      * @return  integer  The m count
394      *
395      * @since   12.1
396      */
397     private static function _m($str)
398     {
399         $c = self::$_regex_consonant;
400         $v = self::$_regex_vowel;
401 
402         $str = preg_replace("#^$c+#", '', $str);
403         $str = preg_replace("#$v+$#", '', $str);
404 
405         preg_match_all("#($v+$c+)#", $str, $matches);
406 
407         return count($matches[1]);
408     }
409 
410     /**
411      * Returns true/false as to whether the given string contains two
412      * of the same consonant next to each other at the end of the string.
413      *
414      * @param   string  $str  String to check
415      *
416      * @return  boolean  Result
417      *
418      * @since   12.1
419      */
420     private static function _doubleConsonant($str)
421     {
422         $c = self::$_regex_consonant;
423 
424         return preg_match("#$c{2}$#", $str, $matches) and $matches[0]{0} == $matches[0]{1};
425     }
426 
427     /**
428      * Checks for ending CVC sequence where second C is not W, X or Y
429      *
430      * @param   string  $str  String to check
431      *
432      * @return  boolean  Result
433      *
434      * @since   12.1
435      */
436     private static function _cvc($str)
437     {
438         $c = self::$_regex_consonant;
439         $v = self::$_regex_vowel;
440 
441         $result = preg_match("#($c$v$c)$#", $str, $matches)
442             and strlen($matches[1]) == 3
443             and $matches[1]{2} != 'w'
444             and $matches[1]{2} != 'x'
445             and $matches[1]{2} != 'y';
446 
447         return $result;
448     }
449 }
450
Namespaces

Classes

Interfaces

Exceptions

Constants

Functions