@@ -143,7 +143,7 @@ char[] caseFoldLower(char[] word, int length) {
143143
144144 // Special prefix handling for Catalan, French, Italian:
145145 // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
146- char [] capitalizeAfterApostrophe (char [] word , int length ) {
146+ static char [] capitalizeAfterApostrophe (char [] word , int length ) {
147147 for (int i = 1 ; i < length - 1 ; i ++) {
148148 if (word [i ] == '\'' ) {
149149 char next = word [i + 1 ];
@@ -175,11 +175,12 @@ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
175175 if (Dictionary .hasFlag (wordFlags , dictionary .onlyincompound )) {
176176 continue ;
177177 }
178- stems .add (newStem (word , length , forms , i ));
178+ stems .add (newStem (word , 0 , length , forms , i ));
179179 }
180180 }
181181 try {
182- stems .addAll (stem (word , length , -1 , (char ) 0 , -1 , 0 , true , true , false , false , caseVariant ));
182+ stems .addAll (
183+ stem (word , 0 , length , -1 , (char ) 0 , -1 , 0 , true , true , false , false , caseVariant ));
183184 } catch (IOException bogus ) {
184185 throw new RuntimeException (bogus );
185186 }
@@ -214,7 +215,7 @@ public List<CharsRef> uniqueStems(char[] word, int length) {
214215 return deduped ;
215216 }
216217
217- private CharsRef newStem (char [] buffer , int length , IntsRef forms , int formID ) {
218+ private CharsRef newStem (char [] buffer , int offset , int length , IntsRef forms , int formID ) {
218219 final String exception ;
219220 if (dictionary .hasStemExceptions ) {
220221 int exceptionID = forms .ints [forms .offset + formID + 1 ];
@@ -232,7 +233,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
232233 if (exception != null ) {
233234 scratchSegment .append (exception );
234235 } else {
235- scratchSegment .append (buffer , 0 , length );
236+ scratchSegment .append (buffer , offset , length );
236237 }
237238 try {
238239 Dictionary .applyMappings (dictionary .oconv , scratchSegment );
@@ -246,7 +247,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
246247 if (exception != null ) {
247248 return new CharsRef (exception );
248249 } else {
249- return new CharsRef (buffer , 0 , length );
250+ return new CharsRef (buffer , offset , length );
250251 }
251252 }
252253 }
@@ -284,6 +285,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
284285 */
285286 private List <CharsRef > stem (
286287 char [] word ,
288+ int offset ,
287289 int length ,
288290 int previous ,
289291 char prevFlag ,
@@ -308,7 +310,7 @@ private List<CharsRef> stem(
308310 int limit = dictionary .fullStrip ? length + 1 : length ;
309311 for (int i = 0 ; i < limit ; i ++) {
310312 if (i > 0 ) {
311- int ch = word [i - 1 ];
313+ char ch = word [offset + i - 1 ];
312314 if (fst .findTargetArc (ch , arc , arc , prefixReader ) == null ) {
313315 break ;
314316 } else if (arc .output () != NO_OUTPUT ) {
@@ -327,15 +329,17 @@ private List<CharsRef> stem(
327329 }
328330
329331 if (isAffixCompatible (prefix , prevFlag , recursionDepth , false )) {
330- char [] strippedWord = stripAffix (word , length , i , prefix , true );
332+ char [] strippedWord = stripAffix (word , offset , length , i , prefix , true );
331333 if (strippedWord == null ) {
332334 continue ;
333335 }
334336
337+ boolean pureAffix = strippedWord == word ;
335338 stems .addAll (
336339 applyAffix (
337340 strippedWord ,
338- strippedWord .length ,
341+ pureAffix ? offset + i : 0 ,
342+ pureAffix ? length - i : strippedWord .length ,
339343 prefix ,
340344 -1 ,
341345 recursionDepth ,
@@ -356,7 +360,7 @@ private List<CharsRef> stem(
356360 int limit = dictionary .fullStrip ? 0 : 1 ;
357361 for (int i = length ; i >= limit ; i --) {
358362 if (i < length ) {
359- int ch = word [i ];
363+ char ch = word [offset + i ];
360364 if (fst .findTargetArc (ch , arc , arc , suffixReader ) == null ) {
361365 break ;
362366 } else if (arc .output () != NO_OUTPUT ) {
@@ -375,15 +379,17 @@ private List<CharsRef> stem(
375379 }
376380
377381 if (isAffixCompatible (suffix , prevFlag , recursionDepth , previousWasPrefix )) {
378- char [] strippedWord = stripAffix (word , length , length - i , suffix , false );
382+ char [] strippedWord = stripAffix (word , offset , length , length - i , suffix , false );
379383 if (strippedWord == null ) {
380384 continue ;
381385 }
382386
387+ boolean pureAffix = strippedWord == word ;
383388 stems .addAll (
384389 applyAffix (
385390 strippedWord ,
386- strippedWord .length ,
391+ pureAffix ? offset : 0 ,
392+ pureAffix ? i : strippedWord .length ,
387393 suffix ,
388394 prefixId ,
389395 recursionDepth ,
@@ -398,7 +404,13 @@ private List<CharsRef> stem(
398404 return stems ;
399405 }
400406
401- private char [] stripAffix (char [] word , int length , int affixLen , int affix , boolean isPrefix ) {
407+ /**
408+ * @return null if the affix condition isn't met; a reference to the same char[] if the affix has
409+ * no strip data and can thus be simply removed; or a new char[] containing the word with the
410+ * affix removed and the strip data restored
411+ */
412+ private char [] stripAffix (
413+ char [] word , int offset , int length , int affixLen , int affix , boolean isPrefix ) {
402414 int deAffixedLen = length - affixLen ;
403415
404416 int stripOrd = dictionary .affixData (affix , Dictionary .AFFIX_STRIP_ORD );
@@ -409,15 +421,22 @@ private char[] stripAffix(char[] word, int length, int affixLen, int affix, bool
409421 char [] stripData = dictionary .stripData ;
410422 boolean condition =
411423 isPrefix
412- ? checkCondition (affix , stripData , stripStart , stripLen , word , affixLen , deAffixedLen )
413- : checkCondition (affix , word , 0 , deAffixedLen , stripData , stripStart , stripLen );
424+ ? checkCondition (
425+ affix , stripData , stripStart , stripLen , word , offset + affixLen , deAffixedLen )
426+ : checkCondition (affix , word , offset , deAffixedLen , stripData , stripStart , stripLen );
414427 if (!condition ) {
415428 return null ;
416429 }
417430
431+ if (stripLen == 0 ) return word ;
432+
418433 char [] strippedWord = new char [stripLen + deAffixedLen ];
419434 System .arraycopy (
420- word , isPrefix ? affixLen : 0 , strippedWord , isPrefix ? stripLen : 0 , deAffixedLen );
435+ word ,
436+ offset + (isPrefix ? affixLen : 0 ),
437+ strippedWord ,
438+ isPrefix ? stripLen : 0 ,
439+ deAffixedLen );
421440 System .arraycopy (stripData , stripStart , strippedWord , isPrefix ? 0 : deAffixedLen , stripLen );
422441 return strippedWord ;
423442 }
@@ -484,6 +503,7 @@ private boolean checkCondition(
484503 */
485504 private List <CharsRef > applyAffix (
486505 char [] strippedWord ,
506+ int offset ,
487507 int length ,
488508 int affix ,
489509 int prefixId ,
@@ -496,7 +516,7 @@ private List<CharsRef> applyAffix(
496516
497517 List <CharsRef > stems = new ArrayList <>();
498518
499- IntsRef forms = dictionary .lookupWord (strippedWord , 0 , length );
519+ IntsRef forms = dictionary .lookupWord (strippedWord , offset , length );
500520 if (forms != null ) {
501521 for (int i = 0 ; i < forms .length ; i += formStep ) {
502522 char [] wordFlags = dictionary .decodeFlags (forms .ints [forms .offset + i ], scratch );
@@ -530,7 +550,7 @@ private List<CharsRef> applyAffix(
530550 if (Dictionary .hasFlag (wordFlags , dictionary .onlyincompound )) {
531551 continue ;
532552 }
533- stems .add (newStem (strippedWord , length , forms , i ));
553+ stems .add (newStem (strippedWord , offset , length , forms , i ));
534554 }
535555 }
536556 }
@@ -572,6 +592,7 @@ private List<CharsRef> applyAffix(
572592 stems .addAll (
573593 stem (
574594 strippedWord ,
595+ offset ,
575596 length ,
576597 affix ,
577598 flag ,
0 commit comments