@@ -26,8 +26,8 @@ import org.apache.commons.text.StringEscapeUtils
2626import org .apache .pekko .NotUsed
2727import org .apache .pekko .actor .ActorSystem
2828import org .apache .pekko .http .scaladsl .Http
29- import org .apache .pekko .http .scaladsl .model .sse .ServerSentEvent
3029import org .apache .pekko .http .scaladsl .model .*
30+ import org .apache .pekko .http .scaladsl .model .sse .ServerSentEvent
3131import org .apache .pekko .http .scaladsl .server .Directives .{as , complete , concat , entity , get , getFromFile , onComplete , path , pathEndOrSingleSlash , pathPrefix , post }
3232import org .apache .pekko .http .scaladsl .server .Route
3333import org .apache .pekko .http .scaladsl .unmarshalling .Unmarshal
@@ -275,6 +275,15 @@ object WikipediaEditsAnalyser extends App {
275275 }
276276 }
277277
// Cleans a list of candidate person names and returns the entries that survive.
// Each name is passed, in order, through:
//   1. StringEscapeUtils.unescapeJava
//   2. replaceAll with a negated character class (see the comment on that line)
//   3. StringUtils.trim
// Entries for which StringUtils.isNotBlank is false are then dropped.
// NOTE(review): this text is a rendered diff and whitespace inside the string
// literals below looks altered by extraction; confirm the exact regex and
// replacement string against the real source file before relying on them.
278+ private def sanitizePersonNames (names : List [String ]): List [String ] = {
279+ names
280+ .map(each => StringEscapeUtils .unescapeJava(each))
281+ // Keep name related content (letters, whitespace, apostrophes, periods, hyphens)
282+ .map(_.replaceAll(" [^\\ p{L}\\ s'.\\ -]" , " " ))
283+ .map(StringUtils .trim)
284+ .filter(StringUtils .isNotBlank)
285+ }
286+
278287 private def findPersonsLocalNER (ctx : Ctx ): Future [Ctx ] = {
279288 logger.info(s " [ ${ctx.traceId}] Local NER: About to find person names in: ${ctx.change.title}" )
280289 val content = ctx.content
@@ -293,12 +302,7 @@ object WikipediaEditsAnalyser extends App {
293302 if (personsFound.isEmpty) {
294303 Future (ctx)
295304 } else {
296- val personsFoundCleaned = personsFound
297- .map(each => StringEscapeUtils .unescapeJava(each))
298- // Keep name related content (letters, whitespace, apostrophes, periods, hyphens)
299- .map(_.replaceAll(" [^\\ p{L}\\ s'.\\ -]" , " " ))
300- .map(StringUtils .trim)
301- .filter(StringUtils .isNotBlank)
305+ val personsFoundCleaned = sanitizePersonNames(personsFound)
302306
303307 logger.debug(s " [ ${ctx.traceId}] Local NER found persons: $personsFoundCleaned from content: $content" )
304308 Future (ctx.copy(personsFoundLocal = personsFoundCleaned))
@@ -361,11 +365,12 @@ object WikipediaEditsAnalyser extends App {
361365 val personsFoundList = if (personsFoundText.isEmpty || personsFoundText.equalsIgnoreCase(" NONE" )) {
362366 List .empty[String ]
363367 } else {
364- personsFoundText.split(" \n " )
368+ val rawNames = personsFoundText.split(" \n " )
365369 .map(_.trim)
366370 .filter(_.nonEmpty)
367371 .filter(! _.equalsIgnoreCase(" NONE" ))
368372 .toList
373+ sanitizePersonNames(rawNames)
369374 }
370375
371376 if (personsFoundList.isEmpty) {
@@ -586,8 +591,9 @@ object WikipediaEditsAnalyser extends App {
586591 def disableProcessing (): ProcessingControlResponse = {
587592 if (isProcessingEnabled.get()) {
588593 isProcessingEnabled.set(false )
589- logger.info(" Processing disabled - suspending LLM calls and indexing (flow continues)" )
590- ProcessingControlResponse (false , " Processing disabled - suspending LLM calls and indexing" )
594+ val msg = " Processing disabled - suspending LLM calls and indexing (flow and local NER continues)"
595+ logger.info(msg)
596+ ProcessingControlResponse (false , msg)
591597 } else {
592598 ProcessingControlResponse (false , " Processing already disabled" )
593599 }
@@ -660,7 +666,7 @@ object WikipediaEditsAnalyser extends App {
660666 },
661667 get {
662668 val response = ProcessingControlResponse (isProcessingEnabled.get(),
663- if (isProcessingEnabled.get()) " Processing enabled" else " Processing disabled" )
669+ if (isProcessingEnabled.get()) " Processing enabled" else " Processing disabled (local NER active) " )
664670 complete(HttpEntity (ContentTypes .`application/json`, response.asJson.noSpaces))
665671 }
666672 )
0 commit comments