{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE Trustworthy #-}

{-# OPTIONS_HADDOCK not-home #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.Internal.Unicode
-- Copyright   :  (c) The University of Glasgow, 2003
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  ghc-devs@haskell.org
-- Stability   :  internal
-- Portability :  non-portable (GHC extensions)
--
-- Implementations for the character predicates (isLower, isUpper, etc.)
-- and the conversions (toUpper, toLower).  The implementation is backed by
-- the Unicode data modules under GHC.Internal.Unicode.Char (general
-- categories, derived core properties and simple case mappings).
--
-----------------------------------------------------------------------------

module GHC.Internal.Unicode (
        unicodeVersion,
        GeneralCategory (..), generalCategory,
        isAscii, isLatin1, isControl,
        isAsciiUpper, isAsciiLower,
        isPrint, isSpace, isUpper, isUpperCase,
        isLower, isLowerCase, isAlpha, isDigit,
        isOctDigit, isHexDigit, isAlphaNum,
        isPunctuation, isSymbol,
        toUpper, toLower, toTitle
    ) where

import GHC.Internal.Base
import GHC.Internal.Real
import GHC.Internal.Enum ( Enum (..), Bounded (..) )
import GHC.Internal.Ix ( Ix (..) )
import GHC.Internal.Num
import GHC.Internal.Unicode.Version
import qualified GHC.Internal.Unicode.Char.DerivedCoreProperties as DCP
import qualified GHC.Internal.Unicode.Char.UnicodeData.GeneralCategory as GC
import qualified GHC.Internal.Unicode.Char.UnicodeData.SimpleLowerCaseMapping as C
import qualified GHC.Internal.Unicode.Char.UnicodeData.SimpleTitleCaseMapping as C
import qualified GHC.Internal.Unicode.Char.UnicodeData.SimpleUpperCaseMapping as C

-- Data.Char.chr already imports this and we need to define a Show instance
-- for GeneralCategory
import GHC.Internal.Show ( Show )

-- $setup
-- >>> import Prelude

-- [NOTE] The constructors of 'GeneralCategory' must be in the same order they
-- are listed in the Unicode Standard, because some functions
-- (e.g. 'generalCategory') rely on the 'Enum' instance.

-- | Unicode General Categories (column 2 of the UnicodeData table) in
-- the order they are listed in the Unicode standard (the Unicode
-- Character Database, in particular).
--
-- The constructor order is significant: the derived 'Enum' instance is
-- used to map numeric category codes onto constructors.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> :t OtherLetter
-- OtherLetter :: GeneralCategory
--
-- 'Eq' instance:
--
-- >>> UppercaseLetter == UppercaseLetter
-- True
-- >>> UppercaseLetter == LowercaseLetter
-- False
--
-- 'Ord' instance:
--
-- >>> NonSpacingMark <= MathSymbol
-- True
--
-- 'Enum' instance:
--
-- >>> enumFromTo ModifierLetter SpacingCombiningMark
-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
--
-- 'Text.Read.Read' instance:
--
-- >>> read "DashPunctuation" :: GeneralCategory
-- DashPunctuation
-- >>> read "17" :: GeneralCategory
-- *** Exception: Prelude.read: no parse
--
-- 'Show' instance:
--
-- >>> show EnclosingMark
-- "EnclosingMark"
--
-- 'Bounded' instance:
--
-- >>> minBound :: GeneralCategory
-- UppercaseLetter
-- >>> maxBound :: GeneralCategory
-- NotAssigned
--
-- 'Ix' instance:
--
-- >>> import GHC.Internal.Data.Ix ( index )
-- >>> index (OtherLetter,Control) FinalQuote
-- 12
-- >>> index (OtherLetter,Control) Format
-- *** Exception: Error in array index
--
data GeneralCategory
        = UppercaseLetter       -- ^ Lu: Letter, Uppercase
        | LowercaseLetter       -- ^ Ll: Letter, Lowercase
        | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
        | ModifierLetter        -- ^ Lm: Letter, Modifier
        | OtherLetter           -- ^ Lo: Letter, Other
        | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
        | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
        | EnclosingMark         -- ^ Me: Mark, Enclosing
        | DecimalNumber         -- ^ Nd: Number, Decimal
        | LetterNumber          -- ^ Nl: Number, Letter
        | OtherNumber           -- ^ No: Number, Other
        | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
        | DashPunctuation       -- ^ Pd: Punctuation, Dash
        | OpenPunctuation       -- ^ Ps: Punctuation, Open
        | ClosePunctuation      -- ^ Pe: Punctuation, Close
        | InitialQuote          -- ^ Pi: Punctuation, Initial quote
        | FinalQuote            -- ^ Pf: Punctuation, Final quote
        | OtherPunctuation      -- ^ Po: Punctuation, Other
        | MathSymbol            -- ^ Sm: Symbol, Math
        | CurrencySymbol        -- ^ Sc: Symbol, Currency
        | ModifierSymbol        -- ^ Sk: Symbol, Modifier
        | OtherSymbol           -- ^ So: Symbol, Other
        | Space                 -- ^ Zs: Separator, Space
        | LineSeparator         -- ^ Zl: Separator, Line
        | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
        | Control               -- ^ Cc: Other, Control
        | Format                -- ^ Cf: Other, Format
        | Surrogate             -- ^ Cs: Other, Surrogate
        | PrivateUse            -- ^ Co: Other, Private Use
        | NotAssigned           -- ^ Cn: Other, Not Assigned
        deriving ( Show     -- ^ @since base-2.01
                 , Eq       -- ^ @since base-2.01
                 , Ord      -- ^ @since base-2.01
                 , Enum     -- ^ @since base-2.01
                 , Bounded  -- ^ @since base-2.01
                 , Ix       -- ^ @since base-2.01
                 )

-- | The Unicode general category of the character. This relies on the
-- 'Enum' instance of 'GeneralCategory', which must remain in the
-- same order as the categories are presented in the Unicode
-- standard.
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> generalCategory 'a'
-- LowercaseLetter
-- >>> generalCategory 'A'
-- UppercaseLetter
-- >>> generalCategory '0'
-- DecimalNumber
-- >>> generalCategory '%'
-- OtherPunctuation
-- >>> generalCategory '♥'
-- OtherSymbol
-- >>> generalCategory '\31'
-- Control
-- >>> generalCategory ' '
-- Space
--
{-# INLINE generalCategory #-}
generalCategory :: Char -> GeneralCategory
-- 'GC.generalCategory' yields the category as an 'Int'; 'toEnum' turns that
-- number into the constructor at the same position.
generalCategory = toEnum . GC.generalCategory

-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii                 :: Char -> Bool
isAscii c               =  c <  '\x80'

-- | Selects the first 256 characters of the Unicode character set,
-- corresponding to the ISO 8859-1 (Latin-1) character set.
isLatin1                :: Char -> Bool
isLatin1 c              =  c <= '\xff'

-- | Selects ASCII lower-case letters,
-- i.e. characters satisfying both 'isAscii' and 'isLower'.
isAsciiLower :: Char -> Bool
isAsciiLower c          =  c >= 'a' && c <= 'z'

-- | Selects ASCII upper-case letters,
-- i.e. characters satisfying both 'isAscii' and 'isUpper'.
isAsciiUpper :: Char -> Bool
isAsciiUpper c          =  c >= 'A' && c <= 'Z'

-- | Selects control characters, which are the non-printing characters of
-- the Latin-1 subset of Unicode.
isControl               :: Char -> Bool
-- Select characters with category 'Control'.
-- By definition (https://www.unicode.org/reports/tr44/#General_Category_Values)
-- “a C0 or C1 control code”, i.e. the 0x00-0x1f, 0x7f, and 0x80-0x9f.
isControl c = case generalCategory c of
        Control -> True
        _       -> False

-- | Selects printable Unicode characters
-- (letters, numbers, marks, punctuation, symbols and spaces).
--
-- This function returns 'False' if its argument has one of the
-- following 'GeneralCategory's, or 'True' otherwise:
--
-- * 'LineSeparator'
-- * 'ParagraphSeparator'
-- * 'Control'
-- * 'Format'
-- * 'Surrogate'
-- * 'PrivateUse'
-- * 'NotAssigned'
isPrint                 :: Char -> Bool
isPrint c = case generalCategory c of
        LineSeparator      -> False
        ParagraphSeparator -> False
        Control            -> False
        Format             -> False
        Surrogate          -> False
        PrivateUse         -> False
        NotAssigned        -> False
        _                  -> True

-- | Returns 'True' for any Unicode space character, and the control
-- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@.
isSpace                 :: Char -> Bool
-- isSpace includes non-breaking space
-- The magic 0x377 isn't really that magical. As of 2014, all the codepoints
-- at or below 0x377 have been assigned, so we shouldn't have to worry about
-- any new spaces appearing below there. It would probably be best to
-- use branchless ||, but currently the eqLit transformation will undo that,
-- so we'll do it like this until there's a way around that.
isSpace c
  -- Fast path for low code points: space (32), the range 0x9..0xd (a single
  -- unsigned comparison, since smaller values wrap around) and no-break
  -- space (0xa0).
  | uc <= 0x377 = uc == 32 || uc - 0x9 <= 4 || uc == 0xa0
  | otherwise = generalCategory c == Space
  where
    uc = fromIntegral (ord c) :: Word

-- | Selects upper-case or title-case alphabetic Unicode characters (letters).
-- Title case is used by a small number of letter ligatures like the
-- single-character form of /Lj/.
--
-- __Note:__ this predicate does /not/ work for letter-like characters such as:
-- @\'Ⓐ\'@ (@U+24B6@ circled Latin capital letter A) and
-- @\'Ⅳ\'@ (@U+2163@ Roman numeral four). This is due to selecting only
-- characters with the 'GeneralCategory' 'UppercaseLetter' or 'TitlecaseLetter'.
--
-- See 'isUpperCase' for a more intuitive predicate. Note that
-- unlike 'isUpperCase', 'isUpper' does select /title-case/ characters such as
-- @\'Dž\'@ (@U+01C5@ Latin capital letter d with small letter z with caron) or
-- @\'ᾯ\'@ (@U+1FAF@ Greek capital letter omega with dasia and perispomeni and
-- prosgegrammeni).
isUpper                 :: Char -> Bool
isUpper c = case generalCategory c of
        UppercaseLetter -> True
        TitlecaseLetter -> True
        _               -> False

-- | Selects upper-case Unicode letter-like characters.
--
-- __Note:__ this predicate selects characters with the Unicode property
-- @Uppercase@, which includes letter-like characters such as:
-- @\'Ⓐ\'@ (@U+24B6@ circled Latin capital letter A) and
-- @\'Ⅳ\'@ (@U+2163@ Roman numeral four).
--
-- See 'isUpper' for the legacy predicate. Note that
-- unlike 'isUpperCase', 'isUpper' does select /title-case/ characters such as
-- @\'Dž\'@ (@U+01C5@ Latin capital letter d with small letter z with caron) or
-- @\'ᾯ\'@ (@U+1FAF@ Greek capital letter omega with dasia and perispomeni and
-- prosgegrammeni).
--
-- @since base-4.18.0.0
{-# INLINE isUpperCase #-}
isUpperCase             :: Char -> Bool
isUpperCase = DCP.isUppercase

-- | Selects lower-case alphabetic Unicode characters (letters).
--
-- __Note:__ this predicate does /not/ work for letter-like characters such as:
-- @\'ⓐ\'@ (@U+24D0@ circled Latin small letter a) and
-- @\'ⅳ\'@ (@U+2173@ small Roman numeral four). This is due to selecting only
-- characters with the 'GeneralCategory' 'LowercaseLetter'.
--
-- See 'isLowerCase' for a more intuitive predicate.
isLower                 :: Char -> Bool
isLower c = case generalCategory c of
        LowercaseLetter -> True
        _               -> False

-- | Selects lower-case Unicode letter-like characters.
--
-- __Note:__ this predicate selects characters with the Unicode property
-- @Lowercase@, which includes letter-like characters such as:
-- @\'ⓐ\'@ (@U+24D0@ circled Latin small letter a) and
-- @\'ⅳ\'@ (@U+2173@ small Roman numeral four).
--
-- See 'isLower' for the legacy predicate.
--
-- @since base-4.18.0.0
{-# INLINE isLowerCase #-}
isLowerCase             :: Char -> Bool
isLowerCase = DCP.isLowercase

-- | Selects alphabetic Unicode characters (lower-case, upper-case and
-- title-case letters, plus letters of caseless scripts and modifier letters).
-- This function is equivalent to 'Data.Char.isLetter'.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'UppercaseLetter'
-- * 'LowercaseLetter'
-- * 'TitlecaseLetter'
-- * 'ModifierLetter'
-- * 'OtherLetter'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Letter\".
isAlpha                 :: Char -> Bool
isAlpha c = case generalCategory c of
        UppercaseLetter -> True
        LowercaseLetter -> True
        TitlecaseLetter -> True
        ModifierLetter  -> True
        OtherLetter     -> True
        _               -> False

-- | Selects alphabetic or numeric Unicode characters.
--
-- Note that numeric digits outside the ASCII range, as well as numeric
-- characters which aren't digits, are selected by this function but not by
-- 'isDigit'. Such characters may be part of identifiers but are not used by
-- the printer and reader to represent numbers, e.g., Roman numerals like
-- @\'Ⅴ\'@ (@U+2164@ Roman numeral five) or full-width digits like
-- @\'1\'@ (@U+FF11@, aka @\'\\65297\'@).
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'UppercaseLetter'
-- * 'LowercaseLetter'
-- * 'TitlecaseLetter'
-- * 'ModifierLetter'
-- * 'OtherLetter'
-- * 'DecimalNumber'
-- * 'LetterNumber'
-- * 'OtherNumber'
isAlphaNum              :: Char -> Bool
isAlphaNum c = case generalCategory c of
        UppercaseLetter -> True
        LowercaseLetter -> True
        TitlecaseLetter -> True
        ModifierLetter  -> True
        OtherLetter     -> True
        DecimalNumber   -> True
        LetterNumber    -> True
        OtherNumber     -> True
        _               -> False


-- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@.
isDigit                 :: Char -> Bool
-- A single unsigned comparison covers both bounds: characters below @\'0\'@
-- give a negative difference, which wraps to a large 'Word'.
isDigit c               =  (fromIntegral (ord c - ord '0') :: Word) <= 9

-- We use an addition and an unsigned comparison instead of two signed
-- comparisons because it's usually faster and puts less strain on branch
-- prediction. It likely also enables some CSE when combined with functions
-- that follow up with an actual conversion.

-- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@.
isOctDigit              :: Char -> Bool
-- A single unsigned comparison covers both bounds: characters below @\'0\'@
-- give a negative difference, which wraps to a large 'Word'.
isOctDigit c            =  (fromIntegral (ord c - ord '0') :: Word) <= 7

-- | Selects ASCII hexadecimal digits,
-- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@.
isHexDigit              :: Char -> Bool
-- Each letter range is checked with one unsigned comparison: subtracting the
-- first character of the range makes anything below it wrap to a large 'Word'.
isHexDigit c            =  isDigit c ||
                           (fromIntegral (ord c - ord 'A') :: Word) <= 5 ||
                           (fromIntegral (ord c - ord 'a') :: Word) <= 5

-- | Selects Unicode punctuation characters, including various kinds
-- of connectors, brackets and quotes.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'ConnectorPunctuation'
-- * 'DashPunctuation'
-- * 'OpenPunctuation'
-- * 'ClosePunctuation'
-- * 'InitialQuote'
-- * 'FinalQuote'
-- * 'OtherPunctuation'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Punctuation\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isPunctuation 'a'
-- False
-- >>> isPunctuation '7'
-- False
-- >>> isPunctuation '♥'
-- False
-- >>> isPunctuation '"'
-- True
-- >>> isPunctuation '?'
-- True
-- >>> isPunctuation '—'
-- True
--
isPunctuation :: Char -> Bool
isPunctuation c = case generalCategory c of
        ConnectorPunctuation    -> True
        DashPunctuation         -> True
        OpenPunctuation         -> True
        ClosePunctuation        -> True
        InitialQuote            -> True
        FinalQuote              -> True
        OtherPunctuation        -> True
        _                       -> False

-- | Selects Unicode symbol characters, including mathematical and
-- currency symbols.
--
-- This function returns 'True' if its argument has one of the
-- following 'GeneralCategory's, or 'False' otherwise:
--
-- * 'MathSymbol'
-- * 'CurrencySymbol'
-- * 'ModifierSymbol'
-- * 'OtherSymbol'
--
-- These classes are defined in the
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
-- part of the Unicode standard. The same document defines what is
-- and is not a \"Symbol\".
--
-- ==== __Examples__
--
-- Basic usage:
--
-- >>> isSymbol 'a'
-- False
-- >>> isSymbol '6'
-- False
-- >>> isSymbol '='
-- True
--
-- The definition of \"math symbol\" may be a little
-- counter-intuitive depending on one's background:
--
-- >>> isSymbol '+'
-- True
-- >>> isSymbol '-'
-- False
--
isSymbol :: Char -> Bool
isSymbol c = case generalCategory c of
        MathSymbol              -> True
        CurrencySymbol          -> True
        ModifierSymbol          -> True
        OtherSymbol             -> True
        _                       -> False

-- | Convert a letter to the corresponding upper-case letter, if any.
-- Any other character is returned unchanged.
{-# INLINE toUpper #-}
toUpper                 :: Char -> Char
toUpper = C.toSimpleUpperCase

-- | Convert a letter to the corresponding lower-case letter, if any.
-- Any other character is returned unchanged.
{-# INLINE toLower #-}
toLower                 :: Char -> Char
toLower = C.toSimpleLowerCase

-- | Convert a letter to the corresponding title-case or upper-case
-- letter, if any.  (Title case differs from upper case only for a small
-- number of ligature letters.)
-- Any other character is returned unchanged.
{-# INLINE toTitle #-}
toTitle                 :: Char -> Char
toTitle = C.toSimpleTitleCase