{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-}
{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}
-- We always optimise this, otherwise performance of a non-optimised
-- compiler is severely affected. This module used to live in the `ghc`
-- package but has been moved to `ghc-boot` because the definition
-- of the package database (needed in both ghc and in ghc-pkg) lives in
-- `ghc-boot` and uses ShortText, which in turn depends on this module.

-- | Simple, non-streaming Modified UTF-8 codecs.
--
-- This is one of several UTF-8 implementations provided by GHC; see Note
-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an
-- overview.
--
module GHC.Utils.Encoding.UTF8
    ( -- * Decoding single characters
      utf8DecodeCharAddr#
    , utf8DecodeCharPtr
    , utf8DecodeCharByteArray#
    , utf8PrevChar
    , utf8CharStart
    , utf8UnconsByteString
      -- * Decoding strings
    , utf8DecodeByteString
    , utf8DecodeShortByteString
    , utf8DecodeForeignPtr
    , utf8DecodeByteArray#
      -- * Counting characters
    , utf8CountCharsShortByteString
    , utf8CountCharsByteArray#
      -- * Comparison
    , utf8CompareByteArray#
    , utf8CompareShortByteString
      -- * Encoding strings
    , utf8EncodeByteArray#
    , utf8EncodePtr
    , utf8EncodeByteString
    , utf8EncodeShortByteString
    , utf8EncodedLength
    ) where


import Prelude

import Foreign
import GHC.IO
import GHC.Encoding.UTF8

import Data.ByteString (ByteString)
import qualified Data.ByteString.Internal as BS
import Data.ByteString.Short.Internal (ShortByteString(..))

-- | Find the start of the codepoint preceding the codepoint at the given
-- 'Ptr'. This is undefined if there is no previous valid codepoint.
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar Ptr Word8
p = Ptr Word8 -> IO (Ptr Word8)
utf8CharStart (Ptr Word8
p Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` (-Int
1))

-- | Find the start of the codepoint at the given 'Ptr'. This is undefined if
-- there is no previous valid codepoint.
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart Ptr Word8
p = Ptr Word8 -> IO (Ptr Word8)
forall {b}. (Storable b, Ord b, Num b) => Ptr b -> IO (Ptr b)
go Ptr Word8
p
 where go :: Ptr b -> IO (Ptr b)
go Ptr b
p = do w <- Ptr b -> IO b
forall a. Storable a => Ptr a -> IO a
peek Ptr b
p
                 if w >= 0x80 && w < 0xC0
                        then go (p `plusPtr` (-1))
                        else return p

utf8CountCharsShortByteString :: ShortByteString -> Int
utf8CountCharsShortByteString :: ShortByteString -> Int
utf8CountCharsShortByteString (SBS ByteArray#
ba) = ByteArray# -> Int
utf8CountCharsByteArray# ByteArray#
ba

utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString :: ShortByteString -> [Char]
utf8DecodeShortByteString (SBS ByteArray#
ba#) = ByteArray# -> [Char]
utf8DecodeByteArray# ByteArray#
ba#

-- | Decode a 'ByteString' containing a UTF-8 string.
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS ForeignPtr Word8
fptr Int
offset Int
len)
  = ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr ForeignPtr Word8
fptr Int
offset Int
len

utf8EncodeShortByteString :: String -> ShortByteString
utf8EncodeShortByteString :: [Char] -> ShortByteString
utf8EncodeShortByteString [Char]
str = ByteArray# -> ShortByteString
SBS ([Char] -> ByteArray#
utf8EncodeByteArray# [Char]
str)

-- | Encode a 'String' into a 'ByteString'.
utf8EncodeByteString :: String -> ByteString
utf8EncodeByteString :: [Char] -> ByteString
utf8EncodeByteString [Char]
s =
  IO ByteString -> ByteString
forall a. IO a -> a
unsafePerformIO (IO ByteString -> ByteString) -> IO ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ do
    let len :: Int
len = [Char] -> Int
utf8EncodedLength [Char]
s
    buf <- Int -> IO (ForeignPtr Word8)
forall a. Int -> IO (ForeignPtr a)
mallocForeignPtrBytes Int
len
    withForeignPtr buf $ \Ptr Word8
ptr -> do
      Ptr Word8 -> [Char] -> IO ()
utf8EncodePtr Ptr Word8
ptr [Char]
s
      ByteString -> IO ByteString
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ForeignPtr Word8 -> Int -> Int -> ByteString
BS.fromForeignPtr ForeignPtr Word8
buf Int
0 Int
len)

utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
utf8UnconsByteString (BS.PS ForeignPtr Word8
_ Int
_ Int
0) = Maybe (Char, ByteString)
forall a. Maybe a
Nothing
utf8UnconsByteString (BS.PS ForeignPtr Word8
fptr Int
offset Int
len)
  = IO (Maybe (Char, ByteString)) -> Maybe (Char, ByteString)
forall a. IO a -> a
unsafeDupablePerformIO (IO (Maybe (Char, ByteString)) -> Maybe (Char, ByteString))
-> IO (Maybe (Char, ByteString)) -> Maybe (Char, ByteString)
forall a b. (a -> b) -> a -> b
$
      ForeignPtr Word8
-> (Ptr Word8 -> IO (Maybe (Char, ByteString)))
-> IO (Maybe (Char, ByteString))
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
fptr ((Ptr Word8 -> IO (Maybe (Char, ByteString)))
 -> IO (Maybe (Char, ByteString)))
-> (Ptr Word8 -> IO (Maybe (Char, ByteString)))
-> IO (Maybe (Char, ByteString))
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> do
        let (Char
c,Int
n) = Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr (Ptr Word8
ptr Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
offset)
        Maybe (Char, ByteString) -> IO (Maybe (Char, ByteString))
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe (Char, ByteString) -> IO (Maybe (Char, ByteString)))
-> Maybe (Char, ByteString) -> IO (Maybe (Char, ByteString))
forall a b. (a -> b) -> a -> b
$ (Char, ByteString) -> Maybe (Char, ByteString)
forall a. a -> Maybe a
Just (Char
c, ForeignPtr Word8 -> Int -> Int -> ByteString
BS.PS ForeignPtr Word8
fptr (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
n) (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n))

utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString :: ShortByteString -> ShortByteString -> Ordering
utf8CompareShortByteString (SBS ByteArray#
a1) (SBS ByteArray#
a2) = ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# ByteArray#
a1 ByteArray#
a2