{-# LANGUAGE MagicHash #-}

{- |
This module defines the types and functions necessary for an Alex-generated
lexer.

https://haskell-alex.readthedocs.io/en/latest/api.html#
-}
module GHC.Parser.Lexer.Interface (
  AlexInput (..),
  alexGetByte,
  alexInputPrevChar,

  -- * Helpers
  alexGetChar,
  adjustChar,
) where

import GHC.Prelude

import Data.Char (GeneralCategory (..), generalCategory, ord)
import Data.Word (Word8)
import GHC.Data.StringBuffer (StringBuffer, atEnd, nextChar, prevChar)
import GHC.Exts
import GHC.Types.SrcLoc (PsLoc, advancePsLoc)

data AlexInput = AI !PsLoc !StringBuffer deriving (Int -> AlexInput -> ShowS
[AlexInput] -> ShowS
AlexInput -> String
(Int -> AlexInput -> ShowS)
-> (AlexInput -> String)
-> ([AlexInput] -> ShowS)
-> Show AlexInput
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> AlexInput -> ShowS
showsPrec :: Int -> AlexInput -> ShowS
$cshow :: AlexInput -> String
show :: AlexInput -> String
$cshowList :: [AlexInput] -> ShowS
showList :: [AlexInput] -> ShowS
Show)

-- See Note [Unicode in Alex]
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (AI PsLoc
loc StringBuffer
s)
  | StringBuffer -> Bool
atEnd StringBuffer
s   = Maybe (Word8, AlexInput)
forall a. Maybe a
Nothing
  | Bool
otherwise = Word8
byte Word8 -> Maybe (Word8, AlexInput) -> Maybe (Word8, AlexInput)
forall a b. a -> b -> b
`seq` PsLoc
loc' PsLoc -> Maybe (Word8, AlexInput) -> Maybe (Word8, AlexInput)
forall a b. a -> b -> b
`seq` StringBuffer
s' StringBuffer
-> Maybe (Word8, AlexInput) -> Maybe (Word8, AlexInput)
forall a b. a -> b -> b
`seq`
                --trace (show (ord c)) $
                (Word8, AlexInput) -> Maybe (Word8, AlexInput)
forall a. a -> Maybe a
Just (Word8
byte, (PsLoc -> StringBuffer -> AlexInput
AI PsLoc
loc' StringBuffer
s'))
  where (Char
c,StringBuffer
s') = StringBuffer -> (Char, StringBuffer)
nextChar StringBuffer
s
        loc' :: PsLoc
loc'   = PsLoc -> Char -> PsLoc
advancePsLoc PsLoc
loc Char
c
        byte :: Word8
byte   = Char -> Word8
adjustChar Char
c

-- Getting the previous 'Char' isn't enough here - we need to convert it into
-- the same format that 'alexGetByte' would have produced.
--
-- See Note [Unicode in Alex] and #13986.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI PsLoc
_ StringBuffer
buf) = Int -> Char
unsafeChr (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Word8
adjustChar Char
pc))
  where pc :: Char
pc = StringBuffer -> Char -> Char
prevChar StringBuffer
buf Char
'\n'

-- backwards compatibility for Alex 2.x
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar AlexInput
inp = case AlexInput -> Maybe (Word8, AlexInput)
alexGetByte AlexInput
inp of
                    Maybe (Word8, AlexInput)
Nothing    -> Maybe (Char, AlexInput)
forall a. Maybe a
Nothing
                    Just (Word8
b,AlexInput
i) -> Char
c Char -> Maybe (Char, AlexInput) -> Maybe (Char, AlexInput)
forall a b. a -> b -> b
`seq` (Char, AlexInput) -> Maybe (Char, AlexInput)
forall a. a -> Maybe a
Just (Char
c,AlexInput
i)
                       where c :: Char
c = Int -> Char
unsafeChr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b

unsafeChr :: Int -> Char
unsafeChr :: Int -> Char
unsafeChr (I# Int#
c) = Char# -> Char
GHC.Exts.C# (Int# -> Char#
GHC.Exts.chr# Int#
c)

{-
Note [Unicode in Alex]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Although newer versions of Alex support unicode, this grammar is processed with
the old style '--latin1' behaviour. This means that when implementing the
functions

    alexGetByte       :: AlexInput -> Maybe (Word8,AlexInput)
    alexInputPrevChar :: AlexInput -> Char

which Alex uses to take apart our 'AlexInput', we must

  * return a latin1 character in the 'Word8' that 'alexGetByte' expects
  * return a latin1 character in 'alexInputPrevChar'.

We handle this in 'adjustChar' by squishing entire classes of unicode
characters into single bytes.
-}

{-# INLINE adjustChar #-}
adjustChar :: Char -> Word8
adjustChar :: Char -> Word8
adjustChar Char
c = Word8
adj_c
  where non_graphic :: Word8
non_graphic     = Word8
0x00
        upper :: Word8
upper           = Word8
0x01
        lower :: Word8
lower           = Word8
0x02
        digit :: Word8
digit           = Word8
0x03
        symbol :: Word8
symbol          = Word8
0x04
        space :: Word8
space           = Word8
0x05
        other_graphic :: Word8
other_graphic   = Word8
0x06
        uniidchar :: Word8
uniidchar       = Word8
0x07

        adj_c :: Word8
adj_c
          | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x07' = Word8
non_graphic
          | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x7f' = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c)
          -- Alex doesn't handle Unicode, so when Unicode
          -- character is encountered we output these values
          -- with the actual character value hidden in the state.
          | Bool
otherwise =
                -- NB: The logic behind these definitions is also reflected
                -- in "GHC.Utils.Lexeme"
                -- Any changes here should likely be reflected there.

                case Char -> GeneralCategory
generalCategory Char
c of
                  GeneralCategory
UppercaseLetter       -> Word8
upper
                  GeneralCategory
LowercaseLetter       -> Word8
lower
                  GeneralCategory
TitlecaseLetter       -> Word8
upper
                  GeneralCategory
ModifierLetter        -> Word8
uniidchar -- see #10196
                  GeneralCategory
OtherLetter           -> Word8
lower -- see #1103
                  GeneralCategory
NonSpacingMark        -> Word8
uniidchar -- see #7650
                  GeneralCategory
SpacingCombiningMark  -> Word8
other_graphic
                  GeneralCategory
EnclosingMark         -> Word8
other_graphic
                  GeneralCategory
DecimalNumber         -> Word8
digit
                  GeneralCategory
LetterNumber          -> Word8
digit
                  GeneralCategory
OtherNumber           -> Word8
digit -- see #4373
                  GeneralCategory
ConnectorPunctuation  -> Word8
symbol
                  GeneralCategory
DashPunctuation       -> Word8
symbol
                  GeneralCategory
OpenPunctuation       -> Word8
other_graphic
                  GeneralCategory
ClosePunctuation      -> Word8
other_graphic
                  GeneralCategory
InitialQuote          -> Word8
other_graphic
                  GeneralCategory
FinalQuote            -> Word8
other_graphic
                  GeneralCategory
OtherPunctuation      -> Word8
symbol
                  GeneralCategory
MathSymbol            -> Word8
symbol
                  GeneralCategory
CurrencySymbol        -> Word8
symbol
                  GeneralCategory
ModifierSymbol        -> Word8
symbol
                  GeneralCategory
OtherSymbol           -> Word8
symbol
                  GeneralCategory
Space                 -> Word8
space
                  GeneralCategory
_other                -> Word8
non_graphic