{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
    UnliftedFFITypes #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
-- |
-- Module      : Data.Text.Encoding
-- Copyright   : (c) 2009, 2010, 2011 Bryan O'Sullivan,
--               (c) 2009 Duncan Coutts,
--               (c) 2008, 2009 Tom Harper
--               (c) 2021 Andrew Lelechenko
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Portability : portable
--
-- Functions for converting 'Text' values to and from 'ByteString',
-- using several standard encodings.
--
-- To gain access to a much larger family of encodings, use the
-- <http://hackage.haskell.org/package/text-icu text-icu package>.

module Data.Text.Encoding
    (
    -- * Decoding ByteStrings to Text
    -- $strict
      decodeASCII
    , decodeLatin1
    , decodeUtf8
    , decodeUtf16LE
    , decodeUtf16BE
    , decodeUtf32LE
    , decodeUtf32BE

    -- ** Catchable failure
    , decodeUtf8'

    -- ** Controllable error handling
    , decodeUtf8With
    , decodeUtf8Lenient
    , decodeUtf16LEWith
    , decodeUtf16BEWith
    , decodeUtf32LEWith
    , decodeUtf32BEWith

    -- ** Stream oriented decoding
    -- $stream
    , streamDecodeUtf8
    , streamDecodeUtf8With
    , Decoding(..)

    -- * Encoding Text to ByteStrings
    , encodeUtf8
    , encodeUtf16LE
    , encodeUtf16BE
    , encodeUtf32LE
    , encodeUtf32BE

    -- * Encoding Text using ByteString Builders
    , encodeUtf8Builder
    , encodeUtf8BuilderEscaped
    ) where

import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)

import Control.Exception (evaluate, try)
import Control.Monad.ST (runST, ST)
import Data.Bits (shiftR, (.&.))
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Short.Internal as SBS
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
import Data.Text.Internal (Text(..), safe, empty, append)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show as T (singleton)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8)
import Foreign.C.Types (CSize(..))
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (poke, peekByteOff)
import GHC.Exts (byteArrayContents#, unsafeCoerce#)
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Builder.Prim.Internal as BP
import Data.Text.Internal.Encoding.Utf8 (utf8DecodeStart, utf8DecodeContinue, DecoderResult(..))
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Fusion as E
import qualified Data.Text.Internal.Fusion as F
import Data.Text.Internal.ByteStringCompat
#if defined(ASSERTS)
import GHC.Stack (HasCallStack)
#endif

#ifdef SIMDUTF
import Foreign.C.Types (CInt(..))
#else
import qualified Data.ByteString.Unsafe as B
import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
#endif

-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.

-- | Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- This is a partial function: it checks that input does not contain
-- anything except ASCII and copies buffer or throws an error otherwise.
--
decodeASCII :: ByteString -> Text
decodeASCII :: ByteString -> Text
decodeASCII ByteString
bs = ByteString -> (ForeignPtr Word8 -> Int -> Text) -> Text
forall r. ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
withBS ByteString
bs ((ForeignPtr Word8 -> Int -> Text) -> Text)
-> (ForeignPtr Word8 -> Int -> Text) -> Text
forall a b. (a -> b) -> a -> b
$ \ForeignPtr Word8
fp Int
len -> if Int
len Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 then Text
empty else (forall s. ST s Text) -> Text
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s Text) -> Text) -> (forall s. ST s Text) -> Text
forall a b. (a -> b) -> a -> b
$ do
  Int
asciiPrefixLen <- (CSize -> Int) -> ST s CSize -> ST s Int
forall a b. (a -> b) -> ST s a -> ST s b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap CSize -> Int
cSizeToInt (ST s CSize -> ST s Int) -> ST s CSize -> ST s Int
forall a b. (a -> b) -> a -> b
$ IO CSize -> ST s CSize
forall a s. IO a -> ST s a
unsafeIOToST (IO CSize -> ST s CSize) -> IO CSize -> ST s CSize
forall a b. (a -> b) -> a -> b
$ ForeignPtr Word8 -> (Ptr Word8 -> IO CSize) -> IO CSize
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO CSize) -> IO CSize)
-> (Ptr Word8 -> IO CSize) -> IO CSize
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
src ->
    Ptr Word8 -> Ptr Word8 -> IO CSize
c_is_ascii Ptr Word8
src (Ptr Word8
src Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
len)
  if Int
asciiPrefixLen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len
  then let !(SBS.SBS ByteArray#
arr) = ByteString -> ShortByteString
SBS.toShort ByteString
bs in
        Text -> ST s Text
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return (Array -> Int -> Int -> Text
Text (ByteArray# -> Array
A.ByteArray ByteArray#
arr) Int
0 Int
len)
  else [Char] -> ST s Text
forall a. HasCallStack => [Char] -> a
error ([Char] -> ST s Text) -> [Char] -> ST s Text
forall a b. (a -> b) -> a -> b
$ [Char]
"decodeASCII: detected non-ASCII codepoint at " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
asciiPrefixLen

-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
--
-- 'decodeLatin1' is semantically equivalent to
--  @Data.Text.pack . Data.ByteString.Char8.unpack@
--
-- This is a total function. However, bear in mind that decoding Latin-1 (non-ASCII)
-- characters to UTf-8 requires actual work and is not just buffer copying.
--
decodeLatin1 ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  ByteString -> Text
decodeLatin1 :: ByteString -> Text
decodeLatin1 ByteString
bs = ByteString -> (ForeignPtr Word8 -> Int -> Text) -> Text
forall r. ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
withBS ByteString
bs ((ForeignPtr Word8 -> Int -> Text) -> Text)
-> (ForeignPtr Word8 -> Int -> Text) -> Text
forall a b. (a -> b) -> a -> b
$ \ForeignPtr Word8
fp Int
len -> (forall s. ST s Text) -> Text
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s Text) -> Text) -> (forall s. ST s Text) -> Text
forall a b. (a -> b) -> a -> b
$ do
  MArray s
dst <- Int -> ST s (MArray s)
forall s. Int -> ST s (MArray s)
A.new (Int
2 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
len)
  let inner :: Int -> Int -> ST s Int
inner Int
srcOff Int
dstOff = if Int
srcOff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
len then Int -> ST s Int
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
dstOff else do
        Int
asciiPrefixLen <- (CSize -> Int) -> ST s CSize -> ST s Int
forall a b. (a -> b) -> ST s a -> ST s b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap CSize -> Int
cSizeToInt (ST s CSize -> ST s Int) -> ST s CSize -> ST s Int
forall a b. (a -> b) -> a -> b
$ IO CSize -> ST s CSize
forall a s. IO a -> ST s a
unsafeIOToST (IO CSize -> ST s CSize) -> IO CSize -> ST s CSize
forall a b. (a -> b) -> a -> b
$ ForeignPtr Word8 -> (Ptr Word8 -> IO CSize) -> IO CSize
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO CSize) -> IO CSize)
-> (Ptr Word8 -> IO CSize) -> IO CSize
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
src ->
          Ptr Word8 -> Ptr Word8 -> IO CSize
c_is_ascii (Ptr Word8
src Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
srcOff) (Ptr Word8
src Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
len)
        if Int
asciiPrefixLen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
        then do
          Word8
byte <- IO Word8 -> ST s Word8
forall a s. IO a -> ST s a
unsafeIOToST (IO Word8 -> ST s Word8) -> IO Word8 -> ST s Word8
forall a b. (a -> b) -> a -> b
$ ForeignPtr Word8 -> (Ptr Word8 -> IO Word8) -> IO Word8
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO Word8) -> IO Word8)
-> (Ptr Word8 -> IO Word8) -> IO Word8
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
src -> Ptr Word8 -> Int -> IO Word8
forall b. Ptr b -> Int -> IO Word8
forall a b. Storable a => Ptr b -> Int -> IO a
peekByteOff Ptr Word8
src Int
srcOff
          MArray s -> Int -> Word8 -> ST s ()
forall s. MArray s -> Int -> Word8 -> ST s ()
A.unsafeWrite MArray s
dst Int
dstOff (Word8
0xC0 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ (Word8
byte Word8 -> Int -> Word8
forall a. Bits a => a -> Int -> a
`shiftR` Int
6))
          MArray s -> Int -> Word8 -> ST s ()
forall s. MArray s -> Int -> Word8 -> ST s ()
A.unsafeWrite MArray s
dst (Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ (Word8
byte Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
          Int -> Int -> ST s Int
inner (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)
        else do
          IO () -> ST s ()
forall a s. IO a -> ST s a
unsafeIOToST (IO () -> ST s ()) -> IO () -> ST s ()
forall a b. (a -> b) -> a -> b
$ ForeignPtr Word8 -> (Ptr Word8 -> IO ()) -> IO ()
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO ()) -> IO ()) -> (Ptr Word8 -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
src ->
            ST s () -> IO ()
forall s a. ST s a -> IO a
unsafeSTToIO (ST s () -> IO ()) -> ST s () -> IO ()
forall a b. (a -> b) -> a -> b
$ MArray s -> Int -> Ptr Word8 -> Int -> ST s ()
forall s. MArray s -> Int -> Ptr Word8 -> Int -> ST s ()
A.copyFromPointer MArray s
dst Int
dstOff (Ptr Word8
src Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
srcOff) Int
asciiPrefixLen
          Int -> Int -> ST s Int
inner (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
asciiPrefixLen) (Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
asciiPrefixLen)

  Int
actualLen <- Int -> Int -> ST s Int
inner Int
0 Int
0
  MArray s
dst' <- MArray s -> Int -> ST s (MArray s)
forall s. MArray s -> Int -> ST s (MArray s)
A.resizeM MArray s
dst Int
actualLen
  Array
arr <- MArray s -> ST s Array
forall s. MArray s -> ST s Array
A.unsafeFreeze MArray s
dst'
  Text -> ST s Text
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> ST s Text) -> Text -> ST s Text
forall a b. (a -> b) -> a -> b
$ Array -> Int -> Int -> Text
Text Array
arr Int
0 Int
actualLen

foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
    :: Ptr Word8 -> Ptr Word8 -> IO CSize

isValidBS :: ByteString -> Bool
#ifdef SIMDUTF
isValidBS :: ByteString -> Bool
isValidBS ByteString
bs = ByteString -> (ForeignPtr Word8 -> Int -> Bool) -> Bool
forall r. ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
withBS ByteString
bs ((ForeignPtr Word8 -> Int -> Bool) -> Bool)
-> (ForeignPtr Word8 -> Int -> Bool) -> Bool
forall a b. (a -> b) -> a -> b
$ \ForeignPtr Word8
fp Int
len -> IO Bool -> Bool
forall a. IO a -> a
unsafeDupablePerformIO (IO Bool -> Bool) -> IO Bool -> Bool
forall a b. (a -> b) -> a -> b
$
  ForeignPtr Word8 -> (Ptr Word8 -> IO Bool) -> IO Bool
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO Bool) -> IO Bool)
-> (Ptr Word8 -> IO Bool) -> IO Bool
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> (CInt -> CInt -> Bool
forall a. Eq a => a -> a -> Bool
/= CInt
0) (CInt -> Bool) -> IO CInt -> IO Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Ptr Word8 -> CSize -> IO CInt
c_is_valid_utf8 Ptr Word8
ptr (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len)
#else
#if MIN_VERSION_bytestring(0,11,2)
isValidBS = B.isValidUtf8
#else
isValidBS bs = start 0
  where
    start ix
      | ix >= B.length bs = True
      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
        Accept{} -> start (ix + 1)
        Reject{} -> False
        Incomplete st _ -> step (ix + 1) st
    step ix st
      | ix >= B.length bs = False
      -- We do not use decoded code point, so passing a dummy value to save an argument.
      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
        Accept{} -> start (ix + 1)
        Reject{} -> False
        Incomplete st' _ -> step (ix + 1) st'
#endif
#endif

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- Surrogate code points in replacement character returned by 'OnDecodeError'
-- will be automatically remapped to the replacement char @U+FFFD@.
decodeUtf8With ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  OnDecodeError -> ByteString -> Text
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
onErr ByteString
bs
  | ByteString -> Bool
isValidBS ByteString
bs =
    let !(SBS.SBS ByteArray#
arr) = ByteString -> ShortByteString
SBS.toShort ByteString
bs in
      (Array -> Int -> Int -> Text
Text (ByteArray# -> Array
A.ByteArray ByteArray#
arr) Int
0 (ByteString -> Int
B.length ByteString
bs))
  | ByteString -> Bool
B.null ByteString
undecoded = Text
txt
  | Bool
otherwise = Text
txt Text -> Text -> Text
`append` (case OnDecodeError
onErr [Char]
desc (Word8 -> Maybe Word8
forall a. a -> Maybe a
Just (HasCallStack => ByteString -> Word8
ByteString -> Word8
B.head ByteString
undecoded)) of
    Maybe Char
Nothing -> Text
txt'
    Just Char
c  -> Char -> Text
T.singleton Char
c Text -> Text -> Text
`append` Text
txt')
  where
    (Text
txt, ByteString
undecoded) = OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
decodeUtf8With2 OnDecodeError
onErr ByteString
forall a. Monoid a => a
mempty ByteString
bs
    txt' :: Text
txt' = OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
onErr (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
undecoded)
    desc :: [Char]
desc = [Char]
"Data.Text.Internal.Encoding: Invalid UTF-8 stream"

-- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
decodeUtf8With2 ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
decodeUtf8With2 :: OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
decodeUtf8With2 OnDecodeError
onErr bs1 :: ByteString
bs1@(ByteString -> Int
B.length -> Int
len1) bs2 :: ByteString
bs2@(ByteString -> Int
B.length -> Int
len2) = (forall s. ST s (Text, ByteString)) -> (Text, ByteString)
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Text, ByteString)) -> (Text, ByteString))
-> (forall s. ST s (Text, ByteString)) -> (Text, ByteString)
forall a b. (a -> b) -> a -> b
$ do
  MArray s
marr <- Int -> ST s (MArray s)
forall s. Int -> ST s (MArray s)
A.new Int
len'
  MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
forall s. MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
outer MArray s
marr Int
len' Int
0 Int
0
  where
    len :: Int
len = Int
len1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len2
    len' :: Int
len' = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4

    index :: Int -> Word8
index Int
i
      | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len1  = HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs1 Int
i
      | Bool
otherwise = HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs2 (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len1)

    -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
    guessUtf8Boundary :: Int
    guessUtf8Boundary :: Int
guessUtf8Boundary
      | Int
len2 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
1 Bool -> Bool -> Bool
&& Word8
w0 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<  Word8
0x80 = Int
len2     -- last char is ASCII
      | Int
len2 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
1 Bool -> Bool -> Bool
&& Word8
w0 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0xC0 = Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 -- last char starts a code point
      | Int
len2 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
2 Bool -> Bool -> Bool
&& Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0xC0 = Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2 -- pre-last char starts a code point
      | Int
len2 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
3 Bool -> Bool -> Bool
&& Word8
w2 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0xC0 = Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
3
      | Int
len2 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
4 Bool -> Bool -> Bool
&& Word8
w3 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word8
0xC0 = Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
4
      | Bool
otherwise = Int
0
      where
        w0 :: Word8
w0 = HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs2 (Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
        w1 :: Word8
w1 = HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs2 (Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2)
        w2 :: Word8
w2 = HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs2 (Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
3)
        w3 :: Word8
w3 = HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs2 (Int
len2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
4)

    decodeFrom :: Int -> DecoderResult
    decodeFrom :: Int -> DecoderResult
decodeFrom Int
off = Int -> DecoderResult -> DecoderResult
step (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Word8 -> DecoderResult
utf8DecodeStart (Int -> Word8
index Int
off))
      where
        step :: Int -> DecoderResult -> DecoderResult
step Int
i (Incomplete DecoderState
a CodePoint
b)
          | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len = Int -> DecoderResult -> DecoderResult
step (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Word8 -> DecoderState -> CodePoint -> DecoderResult
utf8DecodeContinue (Int -> Word8
index Int
i) DecoderState
a CodePoint
b)
        step Int
_ DecoderResult
st = DecoderResult
st

    outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
    outer :: forall s. MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
outer MArray s
dst Int
dstLen = Int -> Int -> ST s (Text, ByteString)
inner
        where
          inner :: Int -> Int -> ST s (Text, ByteString)
inner Int
srcOff Int
dstOff
            | Int
srcOff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
len = do
              MArray s -> Int -> ST s ()
forall s. MArray s -> Int -> ST s ()
A.shrinkM MArray s
dst Int
dstOff
              Array
arr <- MArray s -> ST s Array
forall s. MArray s -> ST s Array
A.unsafeFreeze MArray s
dst
              (Text, ByteString) -> ST s (Text, ByteString)
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return (Array -> Int -> Int -> Text
Text Array
arr Int
0 Int
dstOff, ByteString
forall a. Monoid a => a
mempty)

            | Int
srcOff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
len1
            , Int
srcOff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary
            , Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
len1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
srcOff) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
dstLen
            , ByteString
bs <- Int -> ByteString -> ByteString
B.drop (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len1) (Int -> ByteString -> ByteString
B.take Int
guessUtf8Boundary ByteString
bs2)
            , ByteString -> Bool
isValidBS ByteString
bs = do
              ByteString -> (ForeignPtr Word8 -> Int -> ST s ()) -> ST s ()
forall r. ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
withBS ByteString
bs ((ForeignPtr Word8 -> Int -> ST s ()) -> ST s ())
-> (ForeignPtr Word8 -> Int -> ST s ()) -> ST s ()
forall a b. (a -> b) -> a -> b
$ \ForeignPtr Word8
fp Int
_ -> IO () -> ST s ()
forall a s. IO a -> ST s a
unsafeIOToST (IO () -> ST s ()) -> IO () -> ST s ()
forall a b. (a -> b) -> a -> b
$ ForeignPtr Word8 -> (Ptr Word8 -> IO ()) -> IO ()
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
unsafeWithForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO ()) -> IO ()) -> (Ptr Word8 -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
src ->
                ST s () -> IO ()
forall s a. ST s a -> IO a
unsafeSTToIO (ST s () -> IO ()) -> ST s () -> IO ()
forall a b. (a -> b) -> a -> b
$ MArray s -> Int -> Ptr Word8 -> Int -> ST s ()
forall s. MArray s -> Int -> Ptr Word8 -> Int -> ST s ()
A.copyFromPointer MArray s
dst Int
dstOff Ptr Word8
src (Int
len1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
srcOff)
              Int -> Int -> ST s (Text, ByteString)
inner (Int
len1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary) (Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
len1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
srcOff))

            | Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
dstLen = do
              let dstLen' :: Int
dstLen' = Int
dstLen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4
              MArray s
dst' <- MArray s -> Int -> ST s (MArray s)
forall s. MArray s -> Int -> ST s (MArray s)
A.resizeM MArray s
dst Int
dstLen'
              MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
forall s. MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
outer MArray s
dst' Int
dstLen' Int
srcOff Int
dstOff

            | Bool
otherwise = case Int -> DecoderResult
decodeFrom Int
srcOff of
              Accept Char
c -> do
                Int
d <- MArray s -> Int -> Char -> ST s Int
forall s. MArray s -> Int -> Char -> ST s Int
unsafeWrite MArray s
dst Int
dstOff Char
c
                Int -> Int -> ST s (Text, ByteString)
inner (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d) (Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d)
              DecoderResult
Reject -> case OnDecodeError
onErr [Char]
desc (Word8 -> Maybe Word8
forall a. a -> Maybe a
Just (Int -> Word8
index Int
srcOff)) of
                Maybe Char
Nothing -> Int -> Int -> ST s (Text, ByteString)
inner (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Int
dstOff
                Just Char
c -> do
                  Int
d <- MArray s -> Int -> Char -> ST s Int
forall s. MArray s -> Int -> Char -> ST s Int
unsafeWrite MArray s
dst Int
dstOff (Char -> Char
safe Char
c)
                  Int -> Int -> ST s (Text, ByteString)
inner (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
dstOff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d)
              Incomplete{} -> do
                MArray s -> Int -> ST s ()
forall s. MArray s -> Int -> ST s ()
A.shrinkM MArray s
dst Int
dstOff
                Array
arr <- MArray s -> ST s Array
forall s. MArray s -> ST s Array
A.unsafeFreeze MArray s
dst
                let bs :: ByteString
bs = if Int
srcOff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
len1
                      then Int -> ByteString -> ByteString
B.drop (Int
srcOff Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len1) ByteString
bs2
                      else Int -> ByteString -> ByteString
B.drop Int
srcOff (ByteString
bs1 ByteString -> ByteString -> ByteString
`B.append` ByteString
bs2)
                (Text, ByteString) -> ST s (Text, ByteString)
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return (Array -> Int -> Int -> Text
Text Array
arr Int
0 Int
dstOff, ByteString
bs)

    desc :: [Char]
desc = [Char]
"Data.Text.Internal.Encoding: Invalid UTF-8 stream"

-- $stream
--
-- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept
-- a 'ByteString' that represents a possibly incomplete input (e.g. a
-- packet from a network stream) that may not end on a UTF-8 boundary.
--
-- 1. The maximal prefix of 'Text' that could be decoded from the
--    given input.
--
-- 2. The suffix of the 'ByteString' that could not be decoded due to
--    insufficient input.
--
-- 3. A function that accepts another 'ByteString'.  That string will
--    be assumed to directly follow the string that was passed as
--    input to the original function, and it will in turn be decoded.
--
-- To help understand the use of these functions, consider the Unicode
-- string @\"hi &#9731;\"@. If encoded as UTF-8, this becomes @\"hi
-- \\xe2\\x98\\x83\"@; the final @\'&#9731;\'@ is encoded as 3 bytes.
--
-- Now suppose that we receive this encoded string as 3 packets that
-- are split up on untidy boundaries: @[\"hi \\xe2\", \"\\x98\",
-- \"\\x83\"]@. We cannot decode the entire Unicode string until we
-- have received all three packets, but we would like to make progress
-- as we receive each one.
--
-- @
-- ghci> let s0\@('Some' _ _ f0) = 'streamDecodeUtf8' \"hi \\xe2\"
-- ghci> s0
-- 'Some' \"hi \" \"\\xe2\" _
-- @
--
-- We use the continuation @f0@ to decode our second packet.
--
-- @
-- ghci> let s1\@('Some' _ _ f1) = f0 \"\\x98\"
-- ghci> s1
-- 'Some' \"\" \"\\xe2\\x98\"
-- @
--
-- We could not give @f0@ enough input to decode anything, so it
-- returned an empty string. Once we feed our second continuation @f1@
-- the last byte of input, it will make progress.
--
-- @
-- ghci> let s2\@('Some' _ _ f2) = f1 \"\\x83\"
-- ghci> s2
-- 'Some' \"\\x2603\" \"\" _
-- @
--
-- If given invalid input, an exception will be thrown by the function
-- or continuation where it is encountered.

-- | A stream oriented decoding result.
--
-- @since 1.0.0.0
data Decoding = Some !Text !ByteString (ByteString -> Decoding)

instance Show Decoding where
    showsPrec :: Int -> Decoding -> [Char] -> [Char]
showsPrec Int
d (Some Text
t ByteString
bs ByteString -> Decoding
_) = Bool -> ([Char] -> [Char]) -> [Char] -> [Char]
showParen (Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
prec) (([Char] -> [Char]) -> [Char] -> [Char])
-> ([Char] -> [Char]) -> [Char] -> [Char]
forall a b. (a -> b) -> a -> b
$
                                [Char] -> [Char] -> [Char]
showString [Char]
"Some " ([Char] -> [Char]) -> ([Char] -> [Char]) -> [Char] -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> [Char] -> [Char]
forall a. Show a => Int -> a -> [Char] -> [Char]
showsPrec Int
prec' Text
t ([Char] -> [Char]) -> ([Char] -> [Char]) -> [Char] -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
                                Char -> [Char] -> [Char]
showChar Char
' ' ([Char] -> [Char]) -> ([Char] -> [Char]) -> [Char] -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> [Char] -> [Char]
forall a. Show a => Int -> a -> [Char] -> [Char]
showsPrec Int
prec' ByteString
bs ([Char] -> [Char]) -> ([Char] -> [Char]) -> [Char] -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
                                [Char] -> [Char] -> [Char]
showString [Char]
" _"
      where prec :: Int
prec = Int
10; prec' :: Int
prec' = Int
prec Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1

-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text that is known to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown (either by this function or a continuation) that cannot be
-- caught in pure code.  For more control over the handling of invalid
-- data, use 'streamDecodeUtf8With'.
--
-- @since 1.0.0.0
streamDecodeUtf8 ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  ByteString -> Decoding
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With OnDecodeError
strictDecode

-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
-- encoded text.
--
-- @since 1.0.0.0
streamDecodeUtf8With ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With OnDecodeError
onErr = ByteString -> ByteString -> Decoding
go ByteString
forall a. Monoid a => a
mempty
  where
    go :: ByteString -> ByteString -> Decoding
go ByteString
bs1 ByteString
bs2 = Text -> ByteString -> (ByteString -> Decoding) -> Decoding
Some Text
txt ByteString
undecoded (ByteString -> ByteString -> Decoding
go ByteString
undecoded)
      where
        (Text
txt, ByteString
undecoded) = OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
decodeUtf8With2 OnDecodeError
onErr ByteString
bs1 ByteString
bs2

-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code.  For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
--
-- This is a partial function: it checks that input is a well-formed
-- UTF-8 sequence and copies buffer or throws an error otherwise.
--
decodeUtf8 :: ByteString -> Text
decodeUtf8 :: ByteString -> Text
decodeUtf8 = OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
strictDecode
{-# INLINE[0] decodeUtf8 #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
decodeUtf8' ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  ByteString -> Either UnicodeException Text
decodeUtf8' :: ByteString -> Either UnicodeException Text
decodeUtf8' = IO (Either UnicodeException Text) -> Either UnicodeException Text
forall a. IO a -> a
unsafeDupablePerformIO (IO (Either UnicodeException Text) -> Either UnicodeException Text)
-> (ByteString -> IO (Either UnicodeException Text))
-> ByteString
-> Either UnicodeException Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IO Text -> IO (Either UnicodeException Text)
forall e a. Exception e => IO a -> IO (Either e a)
try (IO Text -> IO (Either UnicodeException Text))
-> (ByteString -> IO Text)
-> ByteString
-> IO (Either UnicodeException Text)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> IO Text
forall a. a -> IO a
evaluate (Text -> IO Text) -> (ByteString -> Text) -> ByteString -> IO Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
strictDecode
{-# INLINE decodeUtf8' #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- Any invalid input bytes will be replaced with the Unicode replacement
-- character U+FFFD.
decodeUtf8Lenient :: ByteString -> Text
decodeUtf8Lenient :: ByteString -> Text
decodeUtf8Lenient = OnDecodeError -> ByteString -> Text
decodeUtf8With OnDecodeError
lenientDecode

-- | Encode text to a ByteString 'B.Builder' using UTF-8 encoding.
--
-- @since 1.1.0.0
encodeUtf8Builder :: Text -> B.Builder
encodeUtf8Builder :: Text -> Builder
encodeUtf8Builder =
    -- manual eta-expansion to ensure inlining works as expected
    \Text
txt -> (forall r. BuildStep r -> BuildStep r) -> Builder
B.builder (Text
-> (BufferRange -> IO (BuildSignal r))
-> BufferRange
-> IO (BuildSignal r)
forall {a}.
Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
step Text
txt)
  where
    step :: Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
step txt :: Text
txt@(Text Array
arr Int
off Int
len) !BufferRange -> IO (BuildSignal a)
k br :: BufferRange
br@(B.BufferRange Ptr Word8
op Ptr Word8
ope)
      -- Ensure that the common case is not recursive and therefore yields
      -- better code.
      | Ptr Word8
forall {b}. Ptr b
op' Ptr Word8 -> Ptr Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Ptr Word8
ope = do
          ST Any () -> IO ()
forall s a. ST s a -> IO a
unsafeSTToIO (ST Any () -> IO ()) -> ST Any () -> IO ()
forall a b. (a -> b) -> a -> b
$ Array -> Int -> Ptr Word8 -> Int -> ST Any ()
forall s. Array -> Int -> Ptr Word8 -> Int -> ST s ()
A.copyToPointer Array
arr Int
off Ptr Word8
op Int
len
          BufferRange -> IO (BuildSignal a)
k (Ptr Word8 -> Ptr Word8 -> BufferRange
B.BufferRange Ptr Word8
forall {b}. Ptr b
op' Ptr Word8
ope)
      | Bool
otherwise = Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
forall {a}.
Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
textCopyStep Text
txt BufferRange -> IO (BuildSignal a)
k BufferRange
br
      where
        op' :: Ptr b
op' = Ptr Word8
op Ptr Word8 -> Int -> Ptr b
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
len
{-# INLINE encodeUtf8Builder #-}

textCopyStep :: Text -> B.BuildStep a -> B.BuildStep a
textCopyStep :: forall {a}.
Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
textCopyStep (Text Array
arr Int
off Int
len) BuildStep a
k =
    Int -> Int -> BuildStep a
go Int
off (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len)
  where
    go :: Int -> Int -> BuildStep a
go !Int
ip !Int
ipe (B.BufferRange Ptr Word8
op Ptr Word8
ope)
      | Int
inpRemaining Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
outRemaining = do
          ST Any () -> IO ()
forall s a. ST s a -> IO a
unsafeSTToIO (ST Any () -> IO ()) -> ST Any () -> IO ()
forall a b. (a -> b) -> a -> b
$ Array -> Int -> Ptr Word8 -> Int -> ST Any ()
forall s. Array -> Int -> Ptr Word8 -> Int -> ST s ()
A.copyToPointer Array
arr Int
ip Ptr Word8
op Int
inpRemaining
          let !br :: BufferRange
br = Ptr Word8 -> Ptr Word8 -> BufferRange
B.BufferRange (Ptr Word8
op Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
inpRemaining) Ptr Word8
ope
          BuildStep a
k BufferRange
br
      | Bool
otherwise = do
          ST Any () -> IO ()
forall s a. ST s a -> IO a
unsafeSTToIO (ST Any () -> IO ()) -> ST Any () -> IO ()
forall a b. (a -> b) -> a -> b
$ Array -> Int -> Ptr Word8 -> Int -> ST Any ()
forall s. Array -> Int -> Ptr Word8 -> Int -> ST s ()
A.copyToPointer Array
arr Int
ip Ptr Word8
op Int
outRemaining
          let !ip' :: Int
ip' = Int
ip Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
outRemaining
          BuildSignal a -> IO (BuildSignal a)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (BuildSignal a -> IO (BuildSignal a))
-> BuildSignal a -> IO (BuildSignal a)
forall a b. (a -> b) -> a -> b
$ Int -> Ptr Word8 -> BuildStep a -> BuildSignal a
forall a. Int -> Ptr Word8 -> BuildStep a -> BuildSignal a
B.bufferFull Int
1 Ptr Word8
ope (Int -> Int -> BuildStep a
go Int
ip' Int
ipe)
      where
        outRemaining :: Int
outRemaining = Ptr Word8
ope Ptr Word8 -> Ptr Word8 -> Int
forall a b. Ptr a -> Ptr b -> Int
`minusPtr` Ptr Word8
op
        inpRemaining :: Int
inpRemaining = Int
ipe Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ip

-- | Encode text using UTF-8 encoding and escape the ASCII characters using
-- a 'BP.BoundedPrim'.
--
-- Use this function is to implement efficient encoders for text-based formats
-- like JSON or HTML.
--
-- @since 1.1.0.0
{-# INLINE encodeUtf8BuilderEscaped #-}
-- TODO: Extend documentation with references to source code in @blaze-html@
-- or @aeson@ that uses this function.
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped :: BoundedPrim Word8 -> Text -> Builder
encodeUtf8BuilderEscaped BoundedPrim Word8
be =
    -- manual eta-expansion to ensure inlining works as expected
    \Text
txt -> (forall r. BuildStep r -> BuildStep r) -> Builder
B.builder (Text
-> (BufferRange -> IO (BuildSignal r))
-> BufferRange
-> IO (BuildSignal r)
forall {a}.
Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
mkBuildstep Text
txt)
  where
    bound :: Int
bound = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
4 (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ BoundedPrim Word8 -> Int
forall a. BoundedPrim a -> Int
BP.sizeBound BoundedPrim Word8
be

    mkBuildstep :: Text
-> (BufferRange -> IO (BuildSignal a))
-> BufferRange
-> IO (BuildSignal a)
mkBuildstep (Text Array
arr Int
off Int
len) !BufferRange -> IO (BuildSignal a)
k =
        Int -> BufferRange -> IO (BuildSignal a)
outerLoop Int
off
      where
        iend :: Int
iend = Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len

        outerLoop :: Int -> BufferRange -> IO (BuildSignal a)
outerLoop !Int
i0 !br :: BufferRange
br@(B.BufferRange Ptr Word8
op0 Ptr Word8
ope)
          | Int
i0 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
iend       = BufferRange -> IO (BuildSignal a)
k BufferRange
br
          | Int
outRemaining Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 = Int -> IO (BuildSignal a)
goPartial (Int
i0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
outRemaining Int
inpRemaining)
          -- TODO: Use a loop with an integrated bound's check if outRemaining
          -- is smaller than 8, as this will save on divisions.
          | Bool
otherwise        = BuildSignal a -> IO (BuildSignal a)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (BuildSignal a -> IO (BuildSignal a))
-> BuildSignal a -> IO (BuildSignal a)
forall a b. (a -> b) -> a -> b
$ Int
-> Ptr Word8
-> (BufferRange -> IO (BuildSignal a))
-> BuildSignal a
forall a. Int -> Ptr Word8 -> BuildStep a -> BuildSignal a
B.bufferFull Int
bound Ptr Word8
op0 (Int -> BufferRange -> IO (BuildSignal a)
outerLoop Int
i0)
          where
            outRemaining :: Int
outRemaining = (Ptr Word8
ope Ptr Word8 -> Ptr Word8 -> Int
forall a b. Ptr a -> Ptr b -> Int
`minusPtr` Ptr Word8
op0) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`quot` Int
bound
            inpRemaining :: Int
inpRemaining = Int
iend Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i0

            goPartial :: Int -> IO (BuildSignal a)
goPartial !Int
iendTmp = Int -> Ptr Word8 -> IO (BuildSignal a)
go Int
i0 Ptr Word8
op0
              where
                go :: Int -> Ptr Word8 -> IO (BuildSignal a)
go !Int
i !Ptr Word8
op
                  | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
iendTmp = do
                    let w :: Word8
w = Array -> Int -> Word8
A.unsafeIndex Array
arr Int
i
                    if Word8
w Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80
                      then BoundedPrim Word8 -> Word8 -> Ptr Word8 -> IO (Ptr Word8)
forall a. BoundedPrim a -> a -> Ptr Word8 -> IO (Ptr Word8)
BP.runB BoundedPrim Word8
be Word8
w Ptr Word8
op IO (Ptr Word8)
-> (Ptr Word8 -> IO (BuildSignal a)) -> IO (BuildSignal a)
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Int -> Ptr Word8 -> IO (BuildSignal a)
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                      else Ptr Word8 -> Word8 -> IO ()
forall a. Storable a => Ptr a -> a -> IO ()
poke Ptr Word8
op Word8
w IO () -> IO (BuildSignal a) -> IO (BuildSignal a)
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Int -> Ptr Word8 -> IO (BuildSignal a)
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Ptr Word8
op Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
1)
                  | Bool
otherwise = Int -> BufferRange -> IO (BuildSignal a)
outerLoop Int
i (Ptr Word8 -> Ptr Word8 -> BufferRange
B.BufferRange Ptr Word8
op Ptr Word8
ope)

-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> ByteString
encodeUtf8 :: Text -> ByteString
encodeUtf8 (Text Array
arr Int
off Int
len)
  | Int
len Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0  = ByteString
B.empty
  -- It would be easier to use Data.ByteString.Short.fromShort and slice later,
  -- but this is undesirable when len is significantly smaller than length arr.
  | Bool
otherwise = IO ByteString -> ByteString
forall a. IO a -> a
unsafeDupablePerformIO (IO ByteString -> ByteString) -> IO ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ do
    marr :: MArray RealWorld
marr@(A.MutableByteArray MutableByteArray# RealWorld
mba) <- ST RealWorld (MArray RealWorld) -> IO (MArray RealWorld)
forall s a. ST s a -> IO a
unsafeSTToIO (ST RealWorld (MArray RealWorld) -> IO (MArray RealWorld))
-> ST RealWorld (MArray RealWorld) -> IO (MArray RealWorld)
forall a b. (a -> b) -> a -> b
$ Int -> ST RealWorld (MArray RealWorld)
forall s. Int -> ST s (MArray s)
A.newPinned Int
len
    ST RealWorld () -> IO ()
forall s a. ST s a -> IO a
unsafeSTToIO (ST RealWorld () -> IO ()) -> ST RealWorld () -> IO ()
forall a b. (a -> b) -> a -> b
$ Int -> MArray RealWorld -> Int -> Array -> Int -> ST RealWorld ()
forall s. Int -> MArray s -> Int -> Array -> Int -> ST s ()
A.copyI Int
len MArray RealWorld
marr Int
0 Array
arr Int
off
    let fp :: ForeignPtr a
fp = Addr# -> ForeignPtrContents -> ForeignPtr a
forall a. Addr# -> ForeignPtrContents -> ForeignPtr a
ForeignPtr (ByteArray# -> Addr#
byteArrayContents# (MutableByteArray# RealWorld -> ByteArray#
forall a b. a -> b
unsafeCoerce# MutableByteArray# RealWorld
mba))
                        (MutableByteArray# RealWorld -> ForeignPtrContents
PlainPtr MutableByteArray# RealWorld
mba)
    ByteString -> IO ByteString
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ByteString -> IO ByteString) -> ByteString -> IO ByteString
forall a b. (a -> b) -> a -> b
$ ForeignPtr Word8 -> Int -> Int -> ByteString
B.fromForeignPtr ForeignPtr Word8
forall {a}. ForeignPtr a
fp Int
0 Int
len

-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf16LE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf16LEWith #-}

-- | Decode text from little endian UTF-16 encoding.
--
-- If the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE = OnDecodeError -> ByteString -> Text
decodeUtf16LEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf16LE #-}

-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf16BE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf16BEWith #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- If the input contains any invalid big endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE = OnDecodeError -> ByteString -> Text
decodeUtf16BEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf16BE #-}

-- | Encode text using little endian UTF-16 encoding.
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE Text
txt = Stream Word8 -> ByteString
E.unstream (Stream Char -> Stream Word8
E.restreamUtf16LE (Text -> Stream Char
F.stream Text
txt))
{-# INLINE encodeUtf16LE #-}

-- | Encode text using big endian UTF-16 encoding.
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE Text
txt = Stream Word8 -> ByteString
E.unstream (Stream Char -> Stream Word8
E.restreamUtf16BE (Text -> Stream Char
F.stream Text
txt))
{-# INLINE encodeUtf16BE #-}

-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf32LE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf32LEWith #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- If the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE = OnDecodeError -> ByteString -> Text
decodeUtf32LEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf32LE #-}

-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith OnDecodeError
onErr ByteString
bs = Stream Char -> Text
F.unstream (OnDecodeError -> ByteString -> Stream Char
E.streamUtf32BE OnDecodeError
onErr ByteString
bs)
{-# INLINE decodeUtf32BEWith #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- If the input contains any invalid big endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE = OnDecodeError -> ByteString -> Text
decodeUtf32BEWith OnDecodeError
strictDecode
{-# INLINE decodeUtf32BE #-}

-- | Encode text using little endian UTF-32 encoding.
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE Text
txt = Stream Word8 -> ByteString
E.unstream (Stream Char -> Stream Word8
E.restreamUtf32LE (Text -> Stream Char
F.stream Text
txt))
{-# INLINE encodeUtf32LE #-}

-- | Encode text using big endian UTF-32 encoding.
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE Text
txt = Stream Word8 -> ByteString
E.unstream (Stream Char -> Stream Word8
E.restreamUtf32BE (Text -> Stream Char
F.stream Text
txt))
{-# INLINE encodeUtf32BE #-}

cSizeToInt :: CSize -> Int
cSizeToInt :: CSize -> Int
cSizeToInt = CSize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

#ifdef SIMDUTF
foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
    :: Ptr Word8 -> CSize -> IO CInt
#endif