1 <?php
2 3 4 5 6 7 8 9
10
11 defined('JPATH_PLATFORM') or die;
12
13 14 15 16 17 18 19 20
/**
 * Suffix-stripping stemmer for English tokens, organised as the steps of the
 * Porter stemming algorithm (1a/1b, 1c, 2, 3, 4 and 5).
 *
 * Only tokens whose language code is exactly 'en' are stemmed; every other
 * token is returned unchanged. Results are memoised per language and token in
 * $this->cache.
 *
 * NOTE(review): $this->cache is not declared in this class; presumably it is
 * provided by the parent JLanguageStemmer - confirm against the parent class.
 */
class JLanguageStemmerPorteren extends JLanguageStemmer
{
    /**
     * Regex fragment matching exactly one consonant: any of the letters
     * b c d f g h j k l m n p q r s t v w x z, a 'y' that directly follows a
     * vowel, or a 'y' at the very start of the subject.
     *
     * @var  string
     */
    private static $_regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex fragment matching exactly one vowel: a, e, i, o, u, or a 'y' that
     * is not directly preceded by a vowel.
     *
     * @var  string
     */
    private static $_regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /**
     * Stems a token and returns the result.
     *
     * Tokens of at most two bytes and tokens whose language is not 'en' are
     * returned unchanged. The suffix rules only look for lowercase ASCII
     * letters, so other characters are never treated as part of a suffix.
     *
     * @param   string  $token  The token to stem.
     * @param   string  $lang   The language code of the token.
     *
     * @return  string  The stemmed token (or the original token when it is not stemmed).
     */
    public function stem($token, $lang)
    {
        // Very short tokens are never stemmed.
        if (strlen($token) <= 2)
        {
            return $token;
        }

        // This stemmer only knows English suffix rules.
        if ($lang !== 'en')
        {
            return $token;
        }

        // Run the pipeline only once per distinct token; later calls hit the cache.
        if (!isset($this->cache[$lang][$token]))
        {
            $result = $token;
            $result = self::_step1ab($result);
            $result = self::_step1c($result);
            $result = self::_step2($result);
            $result = self::_step3($result);
            $result = self::_step4($result);
            $result = self::_step5($result);

            $this->cache[$lang][$token] = $result;
        }

        return $this->cache[$lang][$token];
    }

    /**
     * Step 1a/1b: strips plural suffixes and the 'eed' / 'ed' / 'ing' suffixes.
     *
     * @param   string  $word  The word being stemmed.
     *
     * @return  string  The word after step 1a/1b.
     */
    private static function _step1ab($word)
    {
        // Step 1a. The first rule whose suffix matches ends the chain. The
        // 'ss' => 'ss' rule changes nothing; it exists so that words ending in
        // 'ss' never reach the plain 's' rule.
        if (substr($word, -1) === 's')
        {
            self::_replace($word, 'sses', 'ss')
                || self::_replace($word, 'ies', 'i')
                || self::_replace($word, 'ss', 'ss')
                || self::_replace($word, 's', '');
        }

        // Step 1b. _replace() returns true whenever the word ends in 'eed',
        // whether or not the measure allowed the rewrite, so such words never
        // fall through to the 'ing' / 'ed' rules below.
        if (substr($word, -2, 1) !== 'e' || !self::_replace($word, 'eed', 'ee', 0))
        {
            $v = self::$_regex_vowel;
            $containsVowel = "#{$v}+#";

            // 'ing' / 'ed' are only removed when the part before the suffix
            // contains a vowel.
            if ((preg_match($containsVowel, substr($word, 0, -3)) && self::_replace($word, 'ing', ''))
                || (preg_match($containsVowel, substr($word, 0, -2)) && self::_replace($word, 'ed', '')))
            {
                // Tidy up the stem left behind: restore a trailing 'e' for
                // 'at' / 'bl' / 'iz' ...
                if (!self::_replace($word, 'at', 'ate')
                    && !self::_replace($word, 'bl', 'ble')
                    && !self::_replace($word, 'iz', 'ize'))
                {
                    // ... otherwise undouble a final double consonant (except
                    // ll, ss, zz) ...
                    if (self::_doubleConsonant($word) && !in_array(substr($word, -2), array('ll', 'ss', 'zz'), true))
                    {
                        $word = substr($word, 0, -1);
                    }
                    // ... or append 'e' to a short consonant-vowel-consonant stem.
                    elseif (self::_m($word) == 1 && self::_cvc($word))
                    {
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }

    /**
     * Step 1c: turns a final 'y' into 'i' when the rest of the word contains a vowel.
     *
     * @param   string  $word  The word being stemmed.
     *
     * @return  string  The word after step 1c.
     */
    private static function _step1c($word)
    {
        $v = self::$_regex_vowel;

        if (substr($word, -1) === 'y' && preg_match("#{$v}+#", substr($word, 0, -1)))
        {
            self::_replace($word, 'y', 'i');
        }

        return $word;
    }

    /**
     * Step 2: maps longer suffixes to shorter ones when the stem before the
     * suffix has a measure greater than 0.
     *
     * The switch on the penultimate letter limits the suffixes that are tried.
     * Within a case the first suffix that matches ends the chain, even if the
     * measure condition prevented the rewrite.
     *
     * @param   string  $word  The word being stemmed.
     *
     * @return  string  The word after step 2.
     */
    private static function _step2($word)
    {
        switch (substr($word, -2, 1))
        {
            case 'a':
                self::_replace($word, 'ational', 'ate', 0)
                    || self::_replace($word, 'tional', 'tion', 0);
                break;
            case 'c':
                self::_replace($word, 'enci', 'ence', 0)
                    || self::_replace($word, 'anci', 'ance', 0);
                break;
            case 'e':
                self::_replace($word, 'izer', 'ize', 0);
                break;
            case 'g':
                self::_replace($word, 'logi', 'log', 0);
                break;
            case 'l':
                self::_replace($word, 'entli', 'ent', 0)
                    || self::_replace($word, 'ousli', 'ous', 0)
                    || self::_replace($word, 'alli', 'al', 0)
                    || self::_replace($word, 'bli', 'ble', 0)
                    || self::_replace($word, 'eli', 'e', 0);
                break;
            case 'o':
                self::_replace($word, 'ization', 'ize', 0)
                    || self::_replace($word, 'ation', 'ate', 0)
                    || self::_replace($word, 'ator', 'ate', 0);
                break;
            case 's':
                self::_replace($word, 'iveness', 'ive', 0)
                    || self::_replace($word, 'fulness', 'ful', 0)
                    || self::_replace($word, 'ousness', 'ous', 0)
                    || self::_replace($word, 'alism', 'al', 0);
                break;
            case 't':
                self::_replace($word, 'biliti', 'ble', 0)
                    || self::_replace($word, 'aliti', 'al', 0)
                    || self::_replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: shortens or removes a further set of suffixes when the stem
     * before the suffix has a measure greater than 0.
     *
     * @param   string  $word  The word being stemmed.
     *
     * @return  string  The word after step 3.
     */
    private static function _step3($word)
    {
        switch (substr($word, -2, 1))
        {
            case 'a':
                self::_replace($word, 'ical', 'ic', 0);
                break;
            case 's':
                self::_replace($word, 'ness', '', 0);
                break;
            case 't':
                self::_replace($word, 'icate', 'ic', 0)
                    || self::_replace($word, 'iciti', 'ic', 0);
                break;
            case 'u':
                self::_replace($word, 'ful', '', 0);
                break;
            case 'v':
                self::_replace($word, 'ative', '', 0);
                break;
            case 'z':
                self::_replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: removes a set of suffixes when the stem before the suffix has a
     * measure greater than 1.
     *
     * @param   string  $word  The word being stemmed.
     *
     * @return  string  The word after step 4.
     */
    private static function _step4($word)
    {
        switch (substr($word, -2, 1))
        {
            case 'a':
                self::_replace($word, 'al', '', 1);
                break;
            case 'c':
                self::_replace($word, 'ance', '', 1)
                    || self::_replace($word, 'ence', '', 1);
                break;
            case 'e':
                self::_replace($word, 'er', '', 1);
                break;
            case 'i':
                self::_replace($word, 'ic', '', 1);
                break;
            case 'l':
                self::_replace($word, 'able', '', 1)
                    || self::_replace($word, 'ible', '', 1);
                break;
            case 'n':
                self::_replace($word, 'ant', '', 1)
                    || self::_replace($word, 'ement', '', 1)
                    || self::_replace($word, 'ment', '', 1)
                    || self::_replace($word, 'ent', '', 1);
                break;
            case 'o':
                // 'ion' is only stripped when it follows 's' or 't'.
                if (in_array(substr($word, -4), array('tion', 'sion'), true))
                {
                    self::_replace($word, 'ion', '', 1);
                }
                else
                {
                    self::_replace($word, 'ou', '', 1);
                }
                break;
            case 's':
                self::_replace($word, 'ism', '', 1);
                break;
            case 't':
                self::_replace($word, 'ate', '', 1)
                    || self::_replace($word, 'iti', '', 1);
                break;
            case 'u':
                self::_replace($word, 'ous', '', 1);
                break;
            case 'v':
                self::_replace($word, 'ive', '', 1);
                break;
            case 'z':
                self::_replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: removes a final 'e' and undoubles a final 'll' on long stems.
     *
     * @param   string  $word  The word being stemmed.
     *
     * @return  string  The word after step 5.
     */
    private static function _step5($word)
    {
        // Step 5a: drop the final 'e' when the stem has measure > 1, or
        // measure 1 and does not end consonant-vowel-consonant.
        if (substr($word, -1) === 'e')
        {
            $stem    = substr($word, 0, -1);
            $measure = self::_m($stem);

            if ($measure > 1 || ($measure == 1 && !self::_cvc($stem)))
            {
                self::_replace($word, 'e', '');
            }
        }

        // Step 5b: 'll' becomes 'l' when the whole word has measure > 1.
        if (substr($word, -1) === 'l' && self::_doubleConsonant($word) && self::_m($word) > 1)
        {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces a suffix of $str in place, optionally guarded by a minimum measure.
     *
     * The return value reports whether the suffix MATCHED, not whether the
     * string was changed: when $str ends in $check but the measure condition
     * fails, $str is left untouched and true is still returned. Callers rely on
     * this to stop their rule chains at the first matching suffix.
     *
     * @param   string   &$str   The string to modify (by reference).
     * @param   string   $check  The suffix to look for at the end of $str.
     * @param   string   $repl   The text that replaces the suffix.
     * @param   integer  $m      Optional. When not null, the part of $str before
     *                           the suffix must have a measure greater than $m
     *                           for the replacement to happen.
     *
     * @return  boolean  True if $str ends with $check, false otherwise.
     */
    private static function _replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) === $check)
        {
            $substr = substr($str, 0, $len);

            if ($m === null || self::_m($substr) > $m)
            {
                $str = $substr . $repl;
            }

            return true;
        }

        return false;
    }

    /**
     * Computes the measure of a string: the number of vowel-run/consonant-run
     * pairs that remain after leading consonants and trailing vowels have been
     * stripped.
     *
     * @param   string  $str  The string to measure.
     *
     * @return  integer  The number of vowel-consonant sequences found.
     */
    private static function _m($str)
    {
        $c = self::$_regex_consonant;
        $v = self::$_regex_vowel;

        // Leading consonants and trailing vowels do not contribute to the measure.
        $str = preg_replace("#^{$c}+#", '', $str);
        $str = preg_replace("#{$v}+$#", '', $str);

        preg_match_all("#({$v}+{$c}+)#", $str, $matches);

        return count($matches[1]);
    }

    /**
     * Tests whether a string ends in two identical consonants.
     *
     * @param   string  $str  The string to test.
     *
     * @return  boolean  True if the last two characters are the same consonant.
     */
    private static function _doubleConsonant($str)
    {
        $c = self::$_regex_consonant;

        // Square-bracket offsets are used because the curly-brace form
        // ($s{0}) is no longer valid PHP as of 8.0.
        return preg_match('#' . $c . '{2}$#', $str, $matches)
            && $matches[0][0] === $matches[0][1];
    }

    /**
     * Tests whether a string ends consonant-vowel-consonant where the final
     * consonant is not 'w', 'x' or 'y'.
     *
     * @param   string  $str  The string to test.
     *
     * @return  boolean  True if the ending has that form, false otherwise.
     */
    private static function _cvc($str)
    {
        $c = self::$_regex_consonant;
        $v = self::$_regex_vowel;

        if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches))
        {
            return false;
        }

        // The complete condition is returned as one expression. Storing it via
        // "$result = preg_match(...) and ..." would keep only the preg_match()
        // result, because "=" binds tighter than "and", and the w/x/y
        // exclusion would be lost.
        return strlen($matches[1]) === 3
            && !in_array($matches[1][2], array('w', 'x', 'y'), true);
    }
}
450