11-- | Parallel versions of 'filter' and 'simpleFilter'
2+
23module Text.Fuzzy.Parallel
34( filter ,
45 simpleFilter,
5- Scored (.. ),
6- -- reexports
7- Fuzzy ,
6+ match,
7+ Scored (.. )
88) where
99
10- import Control.Monad.ST (runST )
11- import Control.Parallel.Strategies (Eval , Strategy , evalTraversable ,
12- parTraversable , rseq , using )
13- import Data.Monoid.Textual (TextualMonoid )
14- import Data.Vector (Vector , (!) )
15- import qualified Data.Vector as V
16- -- need to use a stable sort
17- import Data.Bifunctor (second )
18- import Data.Char (toLower )
19- import Data.Maybe (fromMaybe )
20- import qualified Data.Monoid.Textual as T
10+ import Control.Parallel.Strategies (rseq , using , parList , evalList )
11+ import Data.Bits ((.|.) )
12+ import Data.Maybe (fromMaybe , mapMaybe )
13+ import qualified Data.Text as T
14+ import qualified Data.Text.Internal as T
15+ import qualified Data.Text.Array as TA
2116import Prelude hiding (filter )
22- import Text.Fuzzy (Fuzzy (.. ))
2317
24- data Scored a = Scored { score_ :: ! Int , original :: ! a }
25- deriving (Functor ,Show )
18+ data Scored a = Scored { score :: ! Int , original :: ! a }
19+ deriving (Functor , Show )
2620
2721-- | Returns the rendered output and the
2822-- matching score for a pattern and a text.
2923-- Two examples are given below:
3024--
31- -- >>> match "fnt" "infinite" "" "" id True
32- -- Just ("infinite",3)
25+ -- >>> match "fnt" "infinite"
26+ -- Just 3
3327--
34- -- >>> match "hsk" ( "Haskell",1995) "<" ">" fst False
35- -- Just ("<h>a<s><k>ell",5)
28+ -- >>> match "hsk" "Haskell"
29+ -- Just 5
3630--
3731{-# INLINABLE match #-}
3832
39- match :: (T. TextualMonoid s )
40- => s -- ^ Pattern in lowercase except for first character
41- -> t -- ^ The value containing the text to search in.
42- -> s -- ^ The text to add before each match.
43- -> s -- ^ The text to add after each match.
44- -> (t -> s ) -- ^ The function to extract the text from the container.
45- -> Maybe (Fuzzy t s ) -- ^ The original value, rendered string and score.
46- match pattern t pre post extract =
47- if null pat then Just (Fuzzy t result totalScore) else Nothing
33+ match :: T. Text -- ^ Pattern in lowercase except for first character
34+ -> T. Text -- ^ The text to search in.
35+ -> Maybe Int -- ^ The score
36+ match (T. Text pArr pOff pLen) (T. Text sArr sOff sLen) = go 0 1 pOff sOff
4837 where
49- null :: (T. TextualMonoid s ) => s -> Bool
50- null = not . T. any (const True )
51-
52- s = extract t
53- (totalScore, _currScore, result, pat, _) =
54- T. foldl'
55- undefined
56- (\ (tot, cur, res, pat, isFirst) c ->
57- case T. splitCharacterPrefix pat of
58- Nothing -> (tot, 0 , res <> T. singleton c, pat, isFirst)
59- Just (x, xs) ->
60- -- the case of the first character has to match
61- -- otherwise use lower case since the pattern is assumed lower
62- let ! c' = if isFirst then c else toLower c in
63- if x == c' then
64- let cur' = cur * 2 + 1 in
65- (tot + cur', cur', res <> pre <> T. singleton c <> post, xs, False )
66- else (tot, 0 , res <> T. singleton c, pat, isFirst)
67- ) ( 0
68- , 1 -- matching at the start gives a bonus (cur = 1)
69- , mempty , pattern , True ) s
38+ pTotal = pOff + pLen
39+ sDelta = sOff + sLen - pTotal
40+
41+ go ! totalScore ! currScore ! currPOff ! currSOff
42+ -- If pattern has been matched in full
43+ | currPOff >= pTotal
44+ = Just totalScore
45+ -- If there is not enough left to match the rest of the pattern, equivalent to
46+ -- (sOff + sLen - currSOff) < (pOff + pLen - currPOff)
47+ | currSOff > currPOff + sDelta
48+ = Nothing
49+ -- This is slightly broken for non-ASCII:
50+ -- 1. If code units, consisting a single pattern code point, are found as parts
51+ -- of different code points, it counts as a match. Unless you use a ton of emojis
52+ -- as identifiers, such false positives should not be be a big deal,
53+ -- and anyways HLS does not currently support such use cases, because it uses
54+ -- code point and UTF-16 code unit positions interchangeably.
55+ -- 2. Case conversions is not applied to non-ASCII code points, because one has
56+ -- to call T.toLower (not T.map toLower), reallocating the string in full, which
57+ -- is too much of performance penalty for fuzzy search. Again, anyway HLS does not
58+ -- attempt to do justice to Unicode: proper Unicode text matching requires
59+ -- `unicode-transforms` and friends.
60+ -- Altogether we sacrifice correctness for the sake of performance, which
61+ -- is a right trade-off for fuzzy search.
62+ | pByte <- TA. unsafeIndex pArr currPOff
63+ , sByte <- TA. unsafeIndex sArr currSOff
64+ -- First byte (currPOff == pOff) should match exactly, otherwise - up to case.
65+ , pByte == sByte || (currPOff /= pOff && pByte == toLowerAscii sByte)
66+ = let curr = currScore * 2 + 1 in
67+ go (totalScore + curr) curr (currPOff + 1 ) (currSOff + 1 )
68+ | otherwise
69+ = go totalScore 0 currPOff (currSOff + 1 )
70+
71+ toLowerAscii w = if (w - 65 ) < 26 then w .|. 0x20 else w
7072
7173-- | The function to filter a list of values by fuzzy search on the text extracted from them.
72- filter :: (TextualMonoid s )
73- => Int -- ^ Chunk size. 1000 works well.
74- -> Int -- ^ Max. number of results wanted
75- -> s -- ^ Pattern.
76- -> [t ] -- ^ The list of values containing the text to search in.
77- -> s -- ^ The text to add before each match.
78- -> s -- ^ The text to add after each match.
79- -> (t -> s ) -- ^ The function to extract the text from the container.
80- -> [Scored t ] -- ^ The list of results, sorted, highest score first.
81- filter chunkSize maxRes pattern ts pre post extract = runST $ do
82- let v = V. mapMaybe id
83- (V. map (\ t -> match pattern' t pre post extract) (V. fromList ts)
84- `using`
85- parVectorChunk chunkSize (evalTraversable forceScore))
86- perfectScore = score $ fromMaybe (error $ T. toString undefined pattern ) $
87- match pattern' pattern' " " " " id
88- return $ partialSortByAscScore maxRes perfectScore v
74+ filter :: Int -- ^ Chunk size. 1000 works well.
75+ -> Int -- ^ Max. number of results wanted
76+ -> T. Text -- ^ Pattern.
77+ -> [t ] -- ^ The list of values containing the text to search in.
78+ -> (t -> T. Text ) -- ^ The function to extract the text from the container.
79+ -> [Scored t ] -- ^ The list of results, sorted, highest score first.
80+ filter chunkSize maxRes pattern ts extract = partialSortByAscScore maxRes perfectScore (concat vss)
8981 where
9082 -- Preserve case for the first character, make all others lowercase
91- pattern' = case T. splitCharacterPrefix pattern of
92- Just (c, rest) -> T. singleton c <> T. map toLower rest
93- _ -> pattern
83+ pattern' = case T. uncons pattern of
84+ Just (c, rest) -> T. cons c (T. toLower rest)
85+ _ -> pattern
86+ vss = map (mapMaybe (\ t -> flip Scored t <$> match pattern' (extract t))) (chunkList chunkSize ts)
87+ `using` parList (evalList rseq)
88+ perfectScore = fromMaybe (error $ T. unpack pattern ) $ match pattern' pattern'
9489
9590-- | Return all elements of the list that have a fuzzy
9691-- match against the pattern. Runs with default settings where
@@ -99,84 +94,44 @@ filter chunkSize maxRes pattern ts pre post extract = runST $ do
9994-- >>> simpleFilter "vm" ["vim", "emacs", "virtual machine"]
10095-- ["vim","virtual machine"]
10196{-# INLINABLE simpleFilter #-}
102- simpleFilter :: (TextualMonoid s )
103- => Int -- ^ Chunk size. 1000 works well.
104- -> Int -- ^ Max. number of results wanted
105- -> s -- ^ Pattern to look for.
106- -> [s ] -- ^ List of texts to check.
107- -> [Scored s ] -- ^ The ones that match.
97+ simpleFilter :: Int -- ^ Chunk size. 1000 works well.
98+ -> Int -- ^ Max. number of results wanted
99+ -> T. Text -- ^ Pattern to look for.
100+ -> [T. Text ] -- ^ List of texts to check.
101+ -> [Scored T. Text ] -- ^ The ones that match.
108102simpleFilter chunk maxRes pattern xs =
109- filter chunk maxRes pattern xs mempty mempty id
110-
111- --------------------------------------------------------------------------------
112-
113- -- | Evaluation that forces the 'score' field
114- forceScore :: TextualMonoid s => Fuzzy t s -> Eval (Fuzzy t s )
115- forceScore it@ Fuzzy {score} = do
116- score' <- rseq score
117- return it{score = score'}
103+ filter chunk maxRes pattern xs id
118104
119105--------------------------------------------------------------------------------
120106
121- -- | Divides a vector in chunks, applies the strategy in parallel to each chunk.
122- parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a )
123- parVectorChunk chunkSize st v =
124- V. concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v)
125-
126- -- >>> chunkVector 3 (V.fromList [0..10])
127- -- >>> chunkVector 3 (V.fromList [0..11])
128- -- >>> chunkVector 3 (V.fromList [0..12])
129- -- [[0,1,2],[3,4,5],[6,7,8],[9,10]]
130- -- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]]
131- -- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]]
132- chunkVector :: Int -> Vector a -> [Vector a ]
133- chunkVector chunkSize v = do
134- let indices = chunkIndices chunkSize (0 ,V. length v)
135- [V. slice l (h- l+ 1 ) v | (l,h) <- indices]
136-
137- -- >>> chunkIndices 3 (0,9)
138- -- >>> chunkIndices 3 (0,10)
139- -- >>> chunkIndices 3 (0,11)
140- -- [(0,2),(3,5),(6,8)]
141- -- [(0,2),(3,5),(6,8),(9,9)]
142- -- [(0,2),(3,5),(6,8),(9,10)]
143- chunkIndices :: Int -> (Int ,Int ) -> [(Int ,Int )]
144- chunkIndices chunkSize (from,to) =
145- map (second pred ) $
146- pairwise $
147- [from, from+ chunkSize .. to- 1 ] ++ [to]
148-
149- pairwise :: [a ] -> [(a ,a )]
150- pairwise [] = []
151- pairwise [_] = []
152- pairwise (x: y: xs) = (x,y) : pairwise (y: xs)
107+ chunkList :: Int -> [a ] -> [[a ]]
108+ chunkList chunkSize = go
109+ where
110+ go [] = []
111+ go xs = ys : go zs
112+ where
113+ (ys, zs) = splitAt chunkSize xs
153114
154115-- | A stable partial sort ascending by score. O(N) best case, O(wanted*N) worst case
155- partialSortByAscScore :: TextualMonoid s
156- => Int -- ^ Number of items needed
116+ partialSortByAscScore
117+ :: Int -- ^ Number of items needed
157118 -> Int -- ^ Value of a perfect score
158- -> Vector (Fuzzy t s )
159119 -> [Scored t ]
160- partialSortByAscScore wantedCount perfectScore v = loop 0 ( SortState minBound perfectScore 0 ) [] where
161- l = V. length v
162- loop index st@ SortState {.. } acc
120+ -> [ Scored t ]
121+ partialSortByAscScore wantedCount perfectScore orig = loop orig ( SortState minBound perfectScore 0 ) [] where
122+ loop [] st@ SortState {.. } acc
163123 | foundCount == wantedCount = reverse acc
164- | index == l
165- -- ProgressCancelledException
166- = if bestScoreSeen < scoreWanted
167- then loop 0 st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound } acc
124+ | otherwise = if bestScoreSeen < scoreWanted
125+ then loop orig st{scoreWanted = bestScoreSeen, bestScoreSeen = minBound } acc
168126 else reverse acc
169- | otherwise =
170- case v! index of
171- x | score x == scoreWanted
172- -> loop (index+ 1 ) st{foundCount = foundCount+ 1 } (toScored x: acc)
173- | score x < scoreWanted && score x > bestScoreSeen
174- -> loop (index+ 1 ) st{bestScoreSeen = score x} acc
175- | otherwise
176- -> loop (index+ 1 ) st acc
177-
178- toScored :: TextualMonoid s => Fuzzy t s -> Scored t
179- toScored Fuzzy {.. } = Scored score original
127+ loop (x : xs) st@ SortState {.. } acc
128+ | foundCount == wantedCount = reverse acc
129+ | score x == scoreWanted
130+ = loop xs st{foundCount = foundCount+ 1 } (x: acc)
131+ | score x < scoreWanted && score x > bestScoreSeen
132+ = loop xs st{bestScoreSeen = score x} acc
133+ | otherwise
134+ = loop xs st acc
180135
181136data SortState a = SortState
182137 { bestScoreSeen :: ! Int
0 commit comments