11{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-}
2- {-# LANGUAGE Unsafe #-}
2+ {-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-}
33{-# OPTIONS_HADDOCK not-home #-}
44-- | Copyright : (c) 2010 - 2011 Simon Meier
55-- License : BSD3-style (see LICENSE)
@@ -84,6 +84,8 @@ module Data.ByteString.Builder.Internal (
8484 -- , sizedChunksInsert
8585
8686 , byteStringCopy
87+ , ascLiteralCopy
88+ , modUtf8LitCopy
8789 , byteStringInsert
8890 , byteStringThreshold
8991
@@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal (
127129) where
128130
129131import Control.Arrow (second )
132+ import Control.Monad (when )
130133
131134import Data.Semigroup (Semigroup (.. ))
132135
@@ -138,10 +141,12 @@ import qualified Data.ByteString.Short.Internal as Sh
138141import qualified GHC.IO.Buffer as IO (Buffer (.. ), newByteBuffer )
139142import GHC.IO.Handle.Internals (wantWritableHandle , flushWriteBuffer )
140143import GHC.IO.Handle.Types (Handle__ , haByteBuffer , haBufferMode )
144+ import GHC.Ptr (Ptr (.. ))
141145import System.IO (hFlush , BufferMode (.. ), Handle )
142146import Data.IORef
143147
144148import Foreign
149+ import Foreign.C.String (CString )
145150import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr )
146151import System.IO.Unsafe (unsafeDupablePerformIO )
147152
@@ -874,6 +879,75 @@ byteStringInsert :: S.ByteString -> Builder
874879byteStringInsert =
875880 \ bs -> builder $ \ k (BufferRange op _) -> return $ insertChunk op bs k
876881
882+
883+ ------------------------------------------------------------------------------
884+ -- Raw CString encoding
885+ ------------------------------------------------------------------------------
886+
887+ -- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
888+ -- strings that are free of embedded (overlong-encoded as the two-byte sequence
889+ -- @0xC0 0x80@) null characters.
890+ --
891+ -- @since 0.11.5.0
892+ {-# INLINABLE ascLiteralCopy #-}
893+ ascLiteralCopy :: Ptr Word8 -> Int -> Builder
894+ ascLiteralCopy = \ ! ip ! len -> builder $ \ k br -> do
895+ let ! ipe = ip `plusPtr` len
896+ wrappedBytesCopyStep (BufferRange ip ipe) k br
897+
898+ -- | GHC represents @NUL@ in string literals via an overlong 2-byte encoding,
899+ -- which is part of "modified UTF-8" (GHC does not also implement CESU-8).
900+ modifiedUtf8NUL :: CString
901+ modifiedUtf8NUL = Ptr " \xc0\x80 " #
902+
903+ -- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
904+ -- encoded strings that may contain embedded overlong-encodings (as the
905+ -- two-byte sequence @0xC0 0x80@) of null characters.
906+ --
907+ -- @since 0.11.5.0
908+ {-# INLINABLE modUtf8LitCopy #-}
909+ modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
910+ modUtf8LitCopy = \ ! ip ! len -> builder $ \ k br -> do
911+ nullAt <- c_strstr (castPtr ip) modifiedUtf8NUL
912+ modUtf8_step ip len nullAt k br
913+
914+ modUtf8_step :: Ptr Word8 -> Int -> Ptr Word8 -> BuildStep r -> BuildStep r
915+ modUtf8_step ! ip ! len ((== nullPtr) -> True ) k br =
916+ -- Contains no encoded nulls, use simple copy codepath
917+ wrappedBytesCopyStep (BufferRange ip ipe) k br
918+ where
919+ ! ipe = ip `plusPtr` len
920+ modUtf8_step ! ip ! len ! nullAt k (BufferRange op0 ope)
921+ -- Copy as much of the null-free portion of the string as fits into the
922+ -- available buffer space. If the string is long enough, we may have asked
923+ -- for less than its full length, filling the buffer with the rest will go
924+ -- into the next builder step.
925+ | avail > nullFree = do
926+ when (nullFree > 0 ) (copyBytes op0 ip nullFree)
927+ pokeElemOff op0 nullFree 0
928+ let used = nullFree + 2
929+ len' = len - used
930+ ! ip' = ip `plusPtr` used
931+ ! op' = op0 `plusPtr` (nullFree + 1 )
932+ nullAt' <- c_strstr ip' modifiedUtf8NUL
933+ modUtf8_step ip' len' nullAt' k (BufferRange op' ope)
934+ | avail > 0 = do
935+ -- avail <= nullFree
936+ copyBytes op0 ip avail
937+ let len' = len - avail
938+ ! ip' = ip `plusPtr` avail
939+ ! op' = op0 `plusPtr` avail
940+ return $ bufferFull 1 op' (modUtf8_step ip' len' nullAt k)
941+ | otherwise =
942+ return $ bufferFull 1 op0 (modUtf8_step ip len nullAt k)
943+ where
944+ ! avail = ope `minusPtr` op0
945+ ! nullFree = nullAt `minusPtr` ip
946+
947+ foreign import ccall unsafe " string.h strstr" c_strstr
948+ :: CString -> CString -> IO (Ptr Word8 )
949+
950+
877951-- Short bytestrings
878952------------------------------------------------------------------------------
879953
0 commit comments