From 5b169ece20decd6ada765842139500605362496d Mon Sep 17 00:00:00 2001 From: Daniel Bramucci Date: Tue, 27 Oct 2020 22:47:27 -0500 Subject: [PATCH 1/6] Made `BS.hGetLine`'s behavior like `hGetLine`. This closes issue #13. The changes can be summarized as updating `findEOL` to look for "\r\n" in CRLF mode and updating the logic of `haveBuf` to resize the buffer according to the size of the newline. Additionally, tests were added to verify that both `hGetLine`s produce the same behavior. Some of the edge-cases to worry about here include * '\n' still counts as a line end. Thus line endings' length vary between 1 and 2 in CRLF mode. * "\r\r\n" can give a false-start. This means you can't always skip 2 characters when `c' /= '\n'`. * '\r' when not followed by '\n' isn't part of a newline. * Not reading out of the buffer when '\r' is the last character. --- Data/ByteString.hs | 37 ++++++++---- tests/HGetLine.hs | 105 +++++++++++++++++++++++++++++++++++ tests/bytestring-tests.cabal | 21 +++++++ 3 files changed, 153 insertions(+), 10 deletions(-) create mode 100644 tests/HGetLine.hs diff --git a/Data/ByteString.hs b/Data/ByteString.hs index 2d8b337c5..071b9b0d9 100644 --- a/Data/ByteString.hs +++ b/Data/ByteString.hs @@ -1816,31 +1816,48 @@ hGetLine h = else ioe_EOF else haveBuf h_ buf' len xss - haveBuf h_@Handle__{haByteBuffer} + haveBuf h_@Handle__{haByteBuffer, haInputNL} buf@Buffer{ bufRaw=raw, bufR=w, bufL=r } len xss = do - off <- findEOL r w raw + (off, sizeNewline) <- findEOL haInputNL r w raw let new_len = len + off - r xs <- mkPS raw r off -- if eol == True, then off is the offset of the '\n' -- otherwise off == w and the buffer is now empty. if off /= w - then do if w == off + 1 - then writeIORef haByteBuffer buf{ bufL=0, bufR=0 } - else writeIORef haByteBuffer buf{ bufL = off + 1 } - mkBigPS new_len (xs:xss) + then do + -- If off + sizeNewline == w then the remaining buffer is empty + if (off + sizeNewline) == w + then writeIORef haByteBuffer buf{ bufL=0, bufR=0 } + else writeIORef haByteBuffer buf{ bufL = off + sizeNewline } + mkBigPS new_len (xs:xss) else fill h_ buf{ bufL=0, bufR=0 } new_len (xs:xss) -- find the end-of-line character, if there is one - findEOL r w raw - | r == w = return w + findEOL haInputNL r w raw + | r == w = return (w, 0) | otherwise = do c <- readWord8Buf raw r if c == fromIntegral (ord '\n') - then return r -- NB. not r+1: don't include the '\n' - else findEOL (r+1) w raw + then do + -- NB. not r+1: don't include the '\n' + -- Also, it is important that it ends the line in both modes + -- To match System.IO.hGetLine's behavior + return (r, 1) + else if haInputNL == CRLF && c == fromIntegral (ord '\r') && r+1 < w + then do + c' <- readWord8Buf raw (r+1) + if c' == fromIntegral (ord '\n') + then return (r, 2) -- NB. not r+1 or r+2: don't include the '\r\n' + else do + -- We cannot jump 2 characters ahead + -- because if we encountered '\r\r\n' + -- We would miss the pattern starting on the second '\r' + findEOL haInputNL (r+1) w raw + else findEOL haInputNL (r+1) w raw + mkPS :: RawBuffer Word8 -> Int -> Int -> IO ByteString mkPS buf start end = diff --git a/tests/HGetLine.hs b/tests/HGetLine.hs new file mode 100644 index 000000000..2607165fa --- /dev/null +++ b/tests/HGetLine.hs @@ -0,0 +1,105 @@ +{-# LANGUAGE BangPatterns, ScopedTypeVariables #-} +import qualified Data.ByteString.Char8 as S8 + + +import Test.HUnit (assertEqual, assertBool) +import qualified Test.Framework as F +import qualified Test.Framework.Providers.HUnit as F + +import Control.Monad +import System.IO + + +testfile_eof :: FilePath +testfile_eof = "line-endings_eof.txt" + +testfile_lf_eof :: FilePath +testfile_lf_eof = "line-endings_lf_eof.txt" + +testfile_cr_eof :: FilePath +testfile_cr_eof = "line-endings_cr_eof.txt" + +testfile_crlf_eof :: FilePath +testfile_crlf_eof = "line-endings_crlf_eof.txt" + + +testString :: String +testString = concat [ + "This file\r\n" + , "tests how hGetLine\r\r\n" + , " handles\n" + , " newlines.\r\n" + , "It is intentionally\n\r" + , "inconsistent\r\n\r\n \r\n" + , "in how it\r" + , "handles newlines.\r\n\n" + , "If it was in the git repo\n\n" + , "it would need to be marked in\r\r" + , "in .gitattributes as a binary file\n" + , "to stop\n\r\r\n" + , "git from changing its endings" + ] + +writeTestFiles :: IO () +writeTestFiles = do + writeFile testfile_eof $ testString + writeFile testfile_lf_eof $ testString <> "\n" + writeFile testfile_cr_eof $ testString <> "\r" + writeFile testfile_crlf_eof $ testString <> "\r\n" + + +readByLinesBS :: Handle -> IO [S8.ByteString] +readByLinesBS h_ = go [] + where + go lines = do + isEnd <- hIsEOF h_ + if isEnd + then return $ reverse lines + else do + !nextLine <- S8.hGetLine h_ + go (nextLine : lines) + +readByLinesS :: Handle -> IO [String] +readByLinesS h_ = go [] + where + go lines = do + isEnd <- hIsEOF h_ + if isEnd + then return $ reverse lines + else do + !nextLine <- hGetLine h_ + go (nextLine : lines) + +hgetline_like_s8_hgetline :: IO () +hgetline_like_s8_hgetline = + mapM_ (uncurry hgetline_like_s8_hgetline') $ do + file <- [testfile_eof, testfile_lf_eof, testfile_cr_eof, testfile_crlf_eof] + linemode <- [NewlineMode LF LF, NewlineMode CRLF CRLF, NewlineMode LF CRLF, NewlineMode CRLF LF] + return (file, linemode) + + +hgetline_like_s8_hgetline' :: FilePath -> NewlineMode -> IO () +hgetline_like_s8_hgetline' file newlineMode = do + bsLines <- withFile file ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLinesBS h + ) + sLines <- withFile file ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLinesS h + ) + assertEqual ("unpacking S8.hGetLines should equal hGetLines for newlineMode " <> show newlineMode <> " for file " <> file) + (map S8.unpack bsLines) + sLines + assertBool "The test file for hGetLine sshould not be empty" $ bsLines /= [] + + +tests :: [F.Test] +tests = [ + F.testCase "hgetline_like_s8_hgetline" hgetline_like_s8_hgetline + ] + +main :: IO () +main = do + writeTestFiles + F.defaultMain tests diff --git a/tests/bytestring-tests.cabal b/tests/bytestring-tests.cabal index 6f4dd55d2..b3720fb70 100644 --- a/tests/bytestring-tests.cabal +++ b/tests/bytestring-tests.cabal @@ -71,6 +71,27 @@ test-suite lazy-hclose -fno-enable-rewrite-rules -threaded -rtsopts +test-suite hgetline + type: exitcode-stdio-1.0 + main-is: HGetLine.hs + other-modules: Data.ByteString + Data.ByteString.Internal + Data.ByteString.Unsafe + hs-source-dirs: . .. + build-depends: base, ghc-prim, deepseq, random, directory, + test-framework, test-framework-hunit, HUnit + c-sources: ../cbits/fpstring.c + include-dirs: ../include + cpp-options: -DHAVE_TEST_FRAMEWORK=1 + ghc-options: -fwarn-unused-binds + -fno-enable-rewrite-rules + -threaded -rtsopts + extensions: BangPatterns + UnliftedFFITypes, + MagicHash, + ScopedTypeVariables + NamedFieldPuns + executable regressions main-is: Regressions.hs other-modules: Data.ByteString From a70a788e123eb4f2c6adbfd3f17d9e5d82cba473 Mon Sep 17 00:00:00 2001 From: Daniel Bramucci Date: Fri, 6 Nov 2020 21:52:52 -0600 Subject: [PATCH 2/6] Added the property that hGetLine agrees with base --- tests/Properties.hs | 28 +++++++++++++++++++++++++++- tests/QuickCheckUtils.hs | 16 ++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/tests/Properties.hs b/tests/Properties.hs index 5d954cdeb..4513fc843 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -1614,6 +1614,31 @@ prop_read_write_file_D x = unsafePerformIO $ do (const $ do y <- D.readFile f return (x==y)) +prop_hgetline_like_s8_hgetline (LinedASCII filetext) (lineEndIn, lineEndOut) = idempotentIOProperty $ do + let testFileName = "testdata.txt" + let newlineMode = NewlineMode (if lineEndIn then LF else CRLF) (if lineEndOut then LF else CRLF) + writeFile testFileName filetext + bsLines <- withFile testFileName ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLines C.hGetLine h + ) + sLines <- withFile testFileName ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLines System.IO.hGetLine h + ) + return $ map C.unpack bsLines === sLines + where + readByLines getLine h_ = go [] + where + go lines = do + isEnd <- hIsEOF h_ + if isEnd + then return lines + else do + !nextLine <- getLine h_ + go (nextLine : lines) + + ------------------------------------------------------------------------ prop_append_file_P x y = unsafePerformIO $ do @@ -1791,7 +1816,8 @@ io_tests = , testProperty "appendFile " prop_append_file_D , testProperty "packAddress " prop_packAddress - + + , testProperty "pack.hGetLine=hGetLine" prop_hgetline_like_s8_hgetline ] misc_tests = diff --git a/tests/QuickCheckUtils.hs b/tests/QuickCheckUtils.hs index a5ff8ccfc..22d710a3f 100644 --- a/tests/QuickCheckUtils.hs +++ b/tests/QuickCheckUtils.hs @@ -93,6 +93,22 @@ instance Arbitrary String8 where toChar :: Word8 -> Char toChar = toEnum . fromIntegral +-- | Strings, but each char is ASCII and there are a lot of newlines generated +-- +newtype LinedASCII = LinedASCII String + deriving (Eq, Ord, Show) + +instance Arbitrary LinedASCII where + arbitrary = fmap LinedASCII . listOf . oneof $ + [ arbitraryASCIIChar + , elements ['\n', '\r'] + ] + + shrink (LinedASCII s) = fmap LinedASCII (shrink s) + +instance CoArbitrary LinedASCII where + coarbitrary (LinedASCII s) = coarbitrary s + ------------------------------------------------------------------------ -- -- We're doing two forms of testing here. Firstly, model based testing. From 537080cddb9640c868e23746c02fd8f9877a12f5 Mon Sep 17 00:00:00 2001 From: Daniel Bramucci Date: Fri, 6 Nov 2020 22:01:40 -0600 Subject: [PATCH 3/6] Removed the redundant test for hGetLine. The old test had wrote a special file filled with strange line endings. Now that there is a reliable, and consistent property test available for hGetLine, this code can be removed at little cost. --- tests/HGetLine.hs | 105 ----------------------------------- tests/bytestring-tests.cabal | 21 ------- 2 files changed, 126 deletions(-) delete mode 100644 tests/HGetLine.hs diff --git a/tests/HGetLine.hs b/tests/HGetLine.hs deleted file mode 100644 index 2607165fa..000000000 --- a/tests/HGetLine.hs +++ /dev/null @@ -1,105 +0,0 @@ -{-# LANGUAGE BangPatterns, ScopedTypeVariables #-} -import qualified Data.ByteString.Char8 as S8 - - -import Test.HUnit (assertEqual, assertBool) -import qualified Test.Framework as F -import qualified Test.Framework.Providers.HUnit as F - -import Control.Monad -import System.IO - - -testfile_eof :: FilePath -testfile_eof = "line-endings_eof.txt" - -testfile_lf_eof :: FilePath -testfile_lf_eof = "line-endings_lf_eof.txt" - -testfile_cr_eof :: FilePath -testfile_cr_eof = "line-endings_cr_eof.txt" - -testfile_crlf_eof :: FilePath -testfile_crlf_eof = "line-endings_crlf_eof.txt" - - -testString :: String -testString = concat [ - "This file\r\n" - , "tests how hGetLine\r\r\n" - , " handles\n" - , " newlines.\r\n" - , "It is intentionally\n\r" - , "inconsistent\r\n\r\n \r\n" - , "in how it\r" - , "handles newlines.\r\n\n" - , "If it was in the git repo\n\n" - , "it would need to be marked in\r\r" - , "in .gitattributes as a binary file\n" - , "to stop\n\r\r\n" - , "git from changing its endings" - ] - -writeTestFiles :: IO () -writeTestFiles = do - writeFile testfile_eof $ testString - writeFile testfile_lf_eof $ testString <> "\n" - writeFile testfile_cr_eof $ testString <> "\r" - writeFile testfile_crlf_eof $ testString <> "\r\n" - - -readByLinesBS :: Handle -> IO [S8.ByteString] -readByLinesBS h_ = go [] - where - go lines = do - isEnd <- hIsEOF h_ - if isEnd - then return $ reverse lines - else do - !nextLine <- S8.hGetLine h_ - go (nextLine : lines) - -readByLinesS :: Handle -> IO [String] -readByLinesS h_ = go [] - where - go lines = do - isEnd <- hIsEOF h_ - if isEnd - then return $ reverse lines - else do - !nextLine <- hGetLine h_ - go (nextLine : lines) - -hgetline_like_s8_hgetline :: IO () -hgetline_like_s8_hgetline = - mapM_ (uncurry hgetline_like_s8_hgetline') $ do - file <- [testfile_eof, testfile_lf_eof, testfile_cr_eof, testfile_crlf_eof] - linemode <- [NewlineMode LF LF, NewlineMode CRLF CRLF, NewlineMode LF CRLF, NewlineMode CRLF LF] - return (file, linemode) - - -hgetline_like_s8_hgetline' :: FilePath -> NewlineMode -> IO () -hgetline_like_s8_hgetline' file newlineMode = do - bsLines <- withFile file ReadMode (\h -> do - hSetNewlineMode h newlineMode - readByLinesBS h - ) - sLines <- withFile file ReadMode (\h -> do - hSetNewlineMode h newlineMode - readByLinesS h - ) - assertEqual ("unpacking S8.hGetLines should equal hGetLines for newlineMode " <> show newlineMode <> " for file " <> file) - (map S8.unpack bsLines) - sLines - assertBool "The test file for hGetLine sshould not be empty" $ bsLines /= [] - - -tests :: [F.Test] -tests = [ - F.testCase "hgetline_like_s8_hgetline" hgetline_like_s8_hgetline - ] - -main :: IO () -main = do - writeTestFiles - F.defaultMain tests diff --git a/tests/bytestring-tests.cabal b/tests/bytestring-tests.cabal index b3720fb70..6f4dd55d2 100644 --- a/tests/bytestring-tests.cabal +++ b/tests/bytestring-tests.cabal @@ -71,27 +71,6 @@ test-suite lazy-hclose -fno-enable-rewrite-rules -threaded -rtsopts -test-suite hgetline - type: exitcode-stdio-1.0 - main-is: HGetLine.hs - other-modules: Data.ByteString - Data.ByteString.Internal - Data.ByteString.Unsafe - hs-source-dirs: . .. - build-depends: base, ghc-prim, deepseq, random, directory, - test-framework, test-framework-hunit, HUnit - c-sources: ../cbits/fpstring.c - include-dirs: ../include - cpp-options: -DHAVE_TEST_FRAMEWORK=1 - ghc-options: -fwarn-unused-binds - -fno-enable-rewrite-rules - -threaded -rtsopts - extensions: BangPatterns - UnliftedFFITypes, - MagicHash, - ScopedTypeVariables - NamedFieldPuns - executable regressions main-is: Regressions.hs other-modules: Data.ByteString From a3d4c5fe3ce62ec6da67b8c3c3ddab8f4a68a800 Mon Sep 17 00:00:00 2001 From: Daniel Bramucci Date: Sat, 21 Nov 2020 20:48:33 -0600 Subject: [PATCH 4/6] Made hgetline test manage files like other IO tests --- tests/Properties.hs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/Properties.hs b/tests/Properties.hs index 4513fc843..41e34915f 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -1615,18 +1615,23 @@ prop_read_write_file_D x = unsafePerformIO $ do return (x==y)) prop_hgetline_like_s8_hgetline (LinedASCII filetext) (lineEndIn, lineEndOut) = idempotentIOProperty $ do - let testFileName = "testdata.txt" + tid <- myThreadId + let f = "qc-test-"++show tid let newlineMode = NewlineMode (if lineEndIn then LF else CRLF) (if lineEndOut then LF else CRLF) - writeFile testFileName filetext - bsLines <- withFile testFileName ReadMode (\h -> do - hSetNewlineMode h newlineMode - readByLines C.hGetLine h - ) - sLines <- withFile testFileName ReadMode (\h -> do - hSetNewlineMode h newlineMode - readByLines System.IO.hGetLine h - ) - return $ map C.unpack bsLines === sLines + bracket_ + (writeFile f filetext) + (removeFile f) + $ do + bsLines <- withFile f ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLines C.hGetLine h + ) + sLines <- withFile f ReadMode (\h -> do + hSetNewlineMode h newlineMode + readByLines System.IO.hGetLine h + ) + return $ map C.unpack bsLines === sLines + where readByLines getLine h_ = go [] where From 6f6a09875746ffc1cce595b7cc0f03ea63b48130 Mon Sep 17 00:00:00 2001 From: Daniel Bramucci Date: Fri, 27 Nov 2020 00:04:25 -0600 Subject: [PATCH 5/6] Cleaned up prop_hgetline_like_s8_hgetline and fixed a Windows testing bug. On Windows, the test data would be written using the platform newlines. This means that any lone \n would become a \r\n. The consequence is that the property would fail to test the implementation on linux line endings when developing on windows. The fix is to set the newlineMode to noNewlineTranslation before writing the test data. --- tests/Properties.hs | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/tests/Properties.hs b/tests/Properties.hs index 41e34915f..4348fa7c1 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -1615,33 +1615,24 @@ prop_read_write_file_D x = unsafePerformIO $ do return (x==y)) prop_hgetline_like_s8_hgetline (LinedASCII filetext) (lineEndIn, lineEndOut) = idempotentIOProperty $ do - tid <- myThreadId - let f = "qc-test-"++show tid - let newlineMode = NewlineMode (if lineEndIn then LF else CRLF) (if lineEndOut then LF else CRLF) - bracket_ - (writeFile f filetext) - (removeFile f) - $ do - bsLines <- withFile f ReadMode (\h -> do - hSetNewlineMode h newlineMode - readByLines C.hGetLine h - ) - sLines <- withFile f ReadMode (\h -> do - hSetNewlineMode h newlineMode - readByLines System.IO.hGetLine h - ) - return $ map C.unpack bsLines === sLines - + (fn, h) <- openTempFile "." "hgetline-prop-test.tmp" + hSetNewlineMode h noNewlineTranslation -- This is to ensure strings like \n are covered on Windows. + hPutStr h filetext + hClose h + bsLines <- readFileByLines C.hGetLine fn + sLines <- readFileByLines System.IO.hGetLine fn + removeFile fn + return (map C.unpack bsLines === sLines) where - readByLines getLine h_ = go [] - where - go lines = do - isEnd <- hIsEOF h_ - if isEnd - then return lines - else do - !nextLine <- getLine h_ - go (nextLine : lines) + newlineMode = NewlineMode (if lineEndIn then LF else CRLF) (if lineEndOut then LF else CRLF) + readFileByLines getLine fn = withFile fn ReadMode $ \h -> do + hSetNewlineMode h newlineMode + readByLines getLine h + readByLines getLine h = do + isEnd <- hIsEOF h + if isEnd + then return [] + else (:) <$> getLine h <*> readByLines getLine h ------------------------------------------------------------------------ From 3e34f487b89d7d439952ca3e2288c33b63bc7d57 Mon Sep 17 00:00:00 2001 From: Daniel Bramucci Date: Fri, 27 Nov 2020 02:25:26 -0600 Subject: [PATCH 6/6] Slight tweak to hgetline property's newlines. The variables were renamed to make boolean correspondance clearer. Also, True was changed to CRLF in order to get QuickCheck to try shrinking from CRLF to LF if possible. --- tests/Properties.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/Properties.hs b/tests/Properties.hs index 4348fa7c1..bb41e8899 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -1614,7 +1614,7 @@ prop_read_write_file_D x = unsafePerformIO $ do (const $ do y <- D.readFile f return (x==y)) -prop_hgetline_like_s8_hgetline (LinedASCII filetext) (lineEndIn, lineEndOut) = idempotentIOProperty $ do +prop_hgetline_like_s8_hgetline (LinedASCII filetext) (crlfIn, crlfOut) = idempotentIOProperty $ do (fn, h) <- openTempFile "." "hgetline-prop-test.tmp" hSetNewlineMode h noNewlineTranslation -- This is to ensure strings like \n are covered on Windows. hPutStr h filetext @@ -1624,7 +1624,7 @@ prop_hgetline_like_s8_hgetline (LinedASCII filetext) (lineEndIn, lineEndOut) = i removeFile fn return (map C.unpack bsLines === sLines) where - newlineMode = NewlineMode (if lineEndIn then LF else CRLF) (if lineEndOut then LF else CRLF) + newlineMode = NewlineMode (if crlfIn then CRLF else LF) (if crlfOut then CRLF else LF) readFileByLines getLine fn = withFile fn ReadMode $ \h -> do hSetNewlineMode h newlineMode readByLines getLine h