@@ -190,8 +190,25 @@ encodeLatin1Lax = encodeLatin1
190190-- UTF-8 decoding
191191-------------------------------------------------------------------------------
192192
193- -- Int helps in cheaper conversion from Int to Char
193+ -- | CodePoint represents a specific character in the Unicode standard.
194+ -- The code point is a numerical value assigned to each character,
195+ -- and UTF-8 encoding uses a variable number of bytes to represent
196+ -- different code points.
197+ --
198+ -- The code point value is calculated as follows: depending on the type of the
199+ -- leading byte, the significant bits are extracted from each byte of the
200+ -- sequence and combined to form the complete code point value. The specific
201+ -- bit manipulations differ based on the number of bytes used.
194202type CodePoint = Int
203+
204+ -- | DecodeState refers to the number of bytes remaining to complete the current
205+ -- UTF-8 character decoding. For ASCII characters (code points 0 to 127),
206+ -- no decoding state is necessary because they are represented by a single byte.
207+ -- Therefore, the decoding state for ASCII characters can be considered as 0.
208+ -- For multi-byte characters, the decoding state indicates the number of bytes
209+ -- remaining to complete the character. It is usually initialized to a non-zero
210+ -- value corresponding to the number of bytes that follow the leading byte,
211+ -- e.g. 'DecodeState' will be 1 for a 2-byte character.
195212type DecodeState = Word8
196213
197214-- We can divide the errors in three general categories:
@@ -410,17 +427,24 @@ decodeUtf8EitherD :: Monad m
410427 => D. Stream m Word8 -> D. Stream m (Either DecodeError Char )
411428decodeUtf8EitherD = resumeDecodeUtf8EitherD 0 0
412429
413- -- |
430+ -- | Decode a bytestream as UTF-8 encoded characters, returning an 'Either'
431+ -- stream.
432+ --
433+ -- This function is similar to 'decodeUtf8', but instead of replacing the
434+ -- invalid codepoint encountered, it returns a 'Left' 'DecodeError'.
435+ --
436+ -- When decoding is successful and a valid character is encountered, the
437+ -- function returns 'Right' 'Char'.
414438--
415- -- /Pre-release/
416439{-# INLINE decodeUtf8Either #-}
417440decodeUtf8Either :: Monad m
418441 => Stream m Word8 -> Stream m (Either DecodeError Char )
419442decodeUtf8Either = decodeUtf8EitherD
420443
421- -- |
444+ -- | Resume the decoding of a bytestream given a 'DecodeState' and a
445+ -- 'CodePoint'.
422446--
423- -- /Pre-release/
447+ -- >>> decodeUtf8Either = resumeDecodeUtf8Either 0 0
424448{-# INLINE resumeDecodeUtf8Either #-}
425449resumeDecodeUtf8Either
426450 :: Monad m
0 commit comments