{-# LANGUAGE LambdaCase #-}

module GHC.Parser.String (
  StringLexError (..),
  ContainsSmartQuote (..),
  LexStringType (..),
  lexString,

  -- * Unicode smart quote helpers
  isDoubleSmartQuote,
  isSingleSmartQuote,

  -- * Other helpers
  isAnyChar,
  resolveEscapeCharacter,
) where

import GHC.Prelude

import Control.Arrow ((>>>))
import Control.Monad (guard, unless, when)
import Data.Char (chr, isPrint, ord)
import Data.List (unfoldr)
import Data.Maybe (listToMaybe, mapMaybe)
import GHC.Parser.CharClass (
  hexDigit,
  is_any,
  is_decdigit,
  is_hexdigit,
  is_octdigit,
  is_space,
  octDecDigit,
 )
import GHC.Utils.Panic (panic)

data LexStringType = StringTypeSingle | StringTypeMulti

-- | State to accumulate while iterating through string literal.
--
-- Fields are strict here to avoid space leak when iterating through large string literal
-- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
data LexStringState loc = LexStringState
  { forall loc. LexStringState loc -> String
stringAcc :: !String
    -- ^ The string seen so far, reversed
  , forall loc. LexStringState loc -> Int
multilineCommonWsPrefix :: !Int
    -- ^ The common prefix for multiline strings. See Note [Multiline string literals]
  , forall loc. LexStringState loc -> loc
initialLoc :: !loc
    -- ^ The location of the beginning of the string literal
  }

-- | Get the character at the given location, with the location
-- of the next character. Returns Nothing if at the end of the
-- input.
type GetChar loc = loc -> Maybe (Char, loc)

lexString :: LexStringType -> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
lexString :: forall loc.
LexStringType
-> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
lexString LexStringType
strType GetChar loc
getChar loc
initialLoc = LexStringState loc
-> loc -> Either (StringLexError loc) (String, loc)
go LexStringState loc
initialState loc
initialLoc
  where
    initialState :: LexStringState loc
initialState =
      LexStringState
        { stringAcc :: String
stringAcc = String
""
        , multilineCommonWsPrefix :: Int
multilineCommonWsPrefix =
            case LexStringType
strType of
              LexStringType
StringTypeMulti -> Int
forall a. Bounded a => a
maxBound
              LexStringType
_ -> Int
0
        , initialLoc :: loc
initialLoc = loc
initialLoc
        }

    -- 's' is strict here to avoid space leak when iterating through large string literal
    -- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
    go :: LexStringState loc
-> loc -> Either (StringLexError loc) (String, loc)
go !LexStringState loc
s loc
loc0 =
      case GetChar loc
getChar loc
loc0 of
        -- found closing delimiter
        Just (Char
'"', loc
_) | Just loc
loc1 <- LexStringType -> GetChar loc -> loc -> Maybe loc
forall loc. LexStringType -> GetChar loc -> loc -> Maybe loc
checkDelimiter LexStringType
strType GetChar loc
getChar loc
loc0 -> do
          let postprocess :: String -> String
postprocess =
                case LexStringType
strType of
                  LexStringType
StringTypeSingle -> String -> String
forall a. a -> a
id
                  LexStringType
StringTypeMulti -> Int -> String -> String
postprocessMultiline (LexStringState loc -> Int
forall loc. LexStringState loc -> Int
multilineCommonWsPrefix LexStringState loc
s)
          (String, loc) -> Either (StringLexError loc) (String, loc)
forall a b. b -> Either a b
Right (String -> String
postprocess (String -> String) -> (String -> String) -> String -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
forall a. [a] -> [a]
reverse (String -> String) -> String -> String
forall a b. (a -> b) -> a -> b
$ LexStringState loc -> String
forall loc. LexStringState loc -> String
stringAcc LexStringState loc
s, loc
loc1)

        -- found backslash
        Just (c0 :: Char
c0@Char
'\\', loc
loc1) -> do
          case GetChar loc
getChar loc
loc1 of
            -- found '\&' character, which should be elided
            Just (Char
'&', loc
loc2) -> LexStringState loc
-> loc -> Either (StringLexError loc) (String, loc)
go LexStringState loc
s loc
loc2
            -- found start of a string gap
            Just (Char
c1, loc
loc2) | Char -> Bool
is_space Char
c1 -> GetChar loc
-> LexStringState loc -> loc -> Either (StringLexError loc) loc
forall loc.
GetChar loc
-> LexStringState loc -> loc -> Either (StringLexError loc) loc
collapseStringGap GetChar loc
getChar LexStringState loc
s loc
loc2 Either (StringLexError loc) loc
-> (loc -> Either (StringLexError loc) (String, loc))
-> Either (StringLexError loc) (String, loc)
forall a b.
Either (StringLexError loc) a
-> (a -> Either (StringLexError loc) b)
-> Either (StringLexError loc) b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= LexStringState loc
-> loc -> Either (StringLexError loc) (String, loc)
go LexStringState loc
s
            -- some other escape character
            Just (Char
c1, loc
loc2) ->
              case LexStringType
strType of
                LexStringType
StringTypeSingle -> do
                  (c', loc') <- GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
forall loc.
GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
resolveEscapeCharacter GetChar loc
getChar loc
loc1
                  go (addChar c' s) loc'
                LexStringType
StringTypeMulti -> do
                  -- keep escape characters unresolved until after post-processing,
                  -- to distinguish between a user-newline and the user writing "\n".
                  -- but still process the characters here, to find any errors
                  _ <- GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
forall loc.
GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
resolveEscapeCharacter GetChar loc
getChar loc
loc1
                  go (addChar c1 . addChar c0 $ s) loc2
            -- backslash at end of input
            Maybe (Char, loc)
Nothing -> StringLexError loc -> Either (StringLexError loc) (String, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (String, loc))
-> StringLexError loc -> Either (StringLexError loc) (String, loc)
forall a b. (a -> b) -> a -> b
$ loc -> ContainsSmartQuote loc -> StringLexError loc
forall loc. loc -> ContainsSmartQuote loc -> StringLexError loc
BadCharInitialLex loc
loc1 (GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
forall loc.
GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote GetChar loc
getChar LexStringState loc
s)

        -- found newline character in multiline string
        Just (c0 :: Char
c0@Char
'\n', loc
loc1) | LexStringType
StringTypeMulti <- LexStringType
strType ->
          (LexStringState loc
 -> loc -> Either (StringLexError loc) (String, loc))
-> (LexStringState loc, loc)
-> Either (StringLexError loc) (String, loc)
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry LexStringState loc
-> loc -> Either (StringLexError loc) (String, loc)
go ((LexStringState loc, loc)
 -> Either (StringLexError loc) (String, loc))
-> (LexStringState loc, loc)
-> Either (StringLexError loc) (String, loc)
forall a b. (a -> b) -> a -> b
$ GetChar loc
-> LexStringState loc -> loc -> (LexStringState loc, loc)
forall loc.
GetChar loc
-> LexStringState loc -> loc -> (LexStringState loc, loc)
parseLeadingWS GetChar loc
getChar (Char -> LexStringState loc -> LexStringState loc
forall loc. Char -> LexStringState loc -> LexStringState loc
addChar Char
c0 LexStringState loc
s) loc
loc1

        -- found some other character
        Just (Char
c0, loc
loc1) | Char -> Bool
isAnyChar Char
c0 -> LexStringState loc
-> loc -> Either (StringLexError loc) (String, loc)
go (Char -> LexStringState loc -> LexStringState loc
forall loc. Char -> LexStringState loc -> LexStringState loc
addChar Char
c0 LexStringState loc
s) loc
loc1

        -- found some unknown character
        Just (Char
_, loc
_) -> StringLexError loc -> Either (StringLexError loc) (String, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (String, loc))
-> StringLexError loc -> Either (StringLexError loc) (String, loc)
forall a b. (a -> b) -> a -> b
$ loc -> ContainsSmartQuote loc -> StringLexError loc
forall loc. loc -> ContainsSmartQuote loc -> StringLexError loc
BadCharInitialLex loc
loc0 (GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
forall loc.
GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote GetChar loc
getChar LexStringState loc
s)

        -- reached EOF before finding end of string
        Maybe (Char, loc)
Nothing -> StringLexError loc -> Either (StringLexError loc) (String, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (String, loc))
-> StringLexError loc -> Either (StringLexError loc) (String, loc)
forall a b. (a -> b) -> a -> b
$ loc -> ContainsSmartQuote loc -> StringLexError loc
forall loc. loc -> ContainsSmartQuote loc -> StringLexError loc
BadCharInitialLex loc
loc0 (GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
forall loc.
GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote GetChar loc
getChar LexStringState loc
s)
{-# INLINE lexString #-}

checkDelimiter :: LexStringType -> GetChar loc -> loc -> Maybe loc
checkDelimiter :: forall loc. LexStringType -> GetChar loc -> loc -> Maybe loc
checkDelimiter LexStringType
strType GetChar loc
getChar loc
loc0 =
  case LexStringType
strType of
    LexStringType
StringTypeSingle -> do
      ('"', loc1) <- GetChar loc
getChar loc
loc0
      Just loc1
    LexStringType
StringTypeMulti -> do
      ('"', loc1) <- GetChar loc
getChar loc
loc0
      ('"', loc2) <- getChar loc1
      ('"', loc3) <- getChar loc2
      Just loc3
{-# INLINE checkDelimiter #-}

-- | A helper for adding the given character to the lexed string.
addChar :: Char -> LexStringState loc -> LexStringState loc
addChar :: forall loc. Char -> LexStringState loc -> LexStringState loc
addChar Char
c LexStringState loc
s = LexStringState loc
s{stringAcc = c : stringAcc s}
{-# INLINE addChar #-}

-- | Return whether the string we've parsed so far contains any smart quotes.
hasSQuote :: GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote :: forall loc.
GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote GetChar loc
getChar LexStringState loc
s
  | (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any Char -> Bool
isDoubleSmartQuote (LexStringState loc -> String
forall loc. LexStringState loc -> String
stringAcc LexStringState loc
s)
  , (Char
c, loc
loc) : [(Char, loc)]
_ <- ((Char, loc) -> Bool) -> [(Char, loc)] -> [(Char, loc)]
forall a. (a -> Bool) -> [a] -> [a]
filter (Char -> Bool
isDoubleSmartQuote (Char -> Bool) -> ((Char, loc) -> Char) -> (Char, loc) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char, loc) -> Char
forall a b. (a, b) -> a
fst) [(Char, loc)]
allChars =
      Char -> loc -> ContainsSmartQuote loc
forall loc. Char -> loc -> ContainsSmartQuote loc
SmartQuote Char
c loc
loc
  | Bool
otherwise =
      ContainsSmartQuote loc
forall loc. ContainsSmartQuote loc
NoSmartQuote
  where
    allChars :: [(Char, loc)]
allChars = (loc -> Maybe ((Char, loc), loc)) -> loc -> [(Char, loc)]
forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr loc -> Maybe ((Char, loc), loc)
getCharWithLoc (LexStringState loc -> loc
forall loc. LexStringState loc -> loc
initialLoc LexStringState loc
s)
    getCharWithLoc :: loc -> Maybe ((Char, loc), loc)
getCharWithLoc loc
loc =
      case GetChar loc
getChar loc
loc of
        Just (Char
c, loc
loc') -> ((Char, loc), loc) -> Maybe ((Char, loc), loc)
forall a. a -> Maybe a
Just ((Char
c, loc
loc), loc
loc')
        Maybe (Char, loc)
Nothing -> Maybe ((Char, loc), loc)
forall a. Maybe a
Nothing
{-# INLINE hasSQuote #-}

-- | After parsing a backslash and a space character, consume the rest of
-- the string gap and return the next location.
collapseStringGap :: GetChar loc -> LexStringState loc -> loc -> Either (StringLexError loc) loc
collapseStringGap :: forall loc.
GetChar loc
-> LexStringState loc -> loc -> Either (StringLexError loc) loc
collapseStringGap GetChar loc
getChar LexStringState loc
s = loc -> Either (StringLexError loc) loc
go
  where
    go :: loc -> Either (StringLexError loc) loc
go loc
loc0 =
      case GetChar loc
getChar loc
loc0 of
        Just (Char
'\\', loc
loc1) -> loc -> Either (StringLexError loc) loc
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure loc
loc1
        Just (Char
c0, loc
loc1) | Char -> Bool
is_space Char
c0 -> loc -> Either (StringLexError loc) loc
go loc
loc1
        Just (Char, loc)
_ -> StringLexError loc -> Either (StringLexError loc) loc
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) loc)
-> StringLexError loc -> Either (StringLexError loc) loc
forall a b. (a -> b) -> a -> b
$ loc -> ContainsSmartQuote loc -> StringLexError loc
forall loc. loc -> ContainsSmartQuote loc -> StringLexError loc
BadCharInitialLex loc
loc0 (GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
forall loc.
GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote GetChar loc
getChar LexStringState loc
s)
        Maybe (Char, loc)
Nothing -> StringLexError loc -> Either (StringLexError loc) loc
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) loc)
-> StringLexError loc -> Either (StringLexError loc) loc
forall a b. (a -> b) -> a -> b
$ loc -> ContainsSmartQuote loc -> StringLexError loc
forall loc. loc -> ContainsSmartQuote loc -> StringLexError loc
UnexpectedEOF loc
loc0 (GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
forall loc.
GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
hasSQuote GetChar loc
getChar LexStringState loc
s)
{-# INLINE collapseStringGap #-}

-- | See Note [Multiline string literals]
parseLeadingWS :: GetChar loc -> LexStringState loc -> loc -> (LexStringState loc, loc)
parseLeadingWS :: forall loc.
GetChar loc
-> LexStringState loc -> loc -> (LexStringState loc, loc)
parseLeadingWS GetChar loc
getChar = Int -> LexStringState loc -> loc -> (LexStringState loc, loc)
go Int
0
  where
    go :: Int -> LexStringState loc -> loc -> (LexStringState loc, loc)
go !Int
col LexStringState loc
s loc
loc =
      case GetChar loc
getChar loc
loc of
        Just (c :: Char
c@Char
' ', loc
loc') -> Int -> LexStringState loc -> loc -> (LexStringState loc, loc)
go (Int
col Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Char -> LexStringState loc -> LexStringState loc
forall loc. Char -> LexStringState loc -> LexStringState loc
addChar Char
c LexStringState loc
s) loc
loc'
        -- expand tabs
        Just (Char
'\t', loc
loc') ->
          let fill :: Int
fill = Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
- (Int
col Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
8)
              s' :: LexStringState loc
s' = Int
-> (LexStringState loc -> LexStringState loc)
-> LexStringState loc
-> LexStringState loc
forall a. Int -> (a -> a) -> a -> a
applyN Int
fill (Char -> LexStringState loc -> LexStringState loc
forall loc. Char -> LexStringState loc -> LexStringState loc
addChar Char
' ') LexStringState loc
s
           in Int -> LexStringState loc -> loc -> (LexStringState loc, loc)
go (Int
col Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
fill) LexStringState loc
s' loc
loc'
        -- if we see a newline or string delimiter, then this line only contained whitespace, so
        -- don't include it in the common whitespace prefix
        Just (Char
'\n', loc
_) -> (LexStringState loc
s, loc
loc)
        Just (Char
'"', loc
_) | Just loc
_ <- LexStringType -> GetChar loc -> loc -> Maybe loc
forall loc. LexStringType -> GetChar loc -> loc -> Maybe loc
checkDelimiter LexStringType
StringTypeMulti GetChar loc
getChar loc
loc -> (LexStringState loc
s, loc
loc)
        -- found some other character, so we're done parsing leading whitespace
        Maybe (Char, loc)
_ ->
          let s' :: LexStringState loc
s' = LexStringState loc
s{multilineCommonWsPrefix = min col (multilineCommonWsPrefix s)}
           in (LexStringState loc
s', loc
loc)

    applyN :: Int -> (a -> a) -> a -> a
    applyN :: forall a. Int -> (a -> a) -> a -> a
applyN Int
n a -> a
f a
x0 = (a -> a) -> a -> [a]
forall a. (a -> a) -> a -> [a]
iterate a -> a
f a
x0 [a] -> Int -> a
forall a. HasCallStack => [a] -> Int -> a
!! Int
n
{-# INLINE parseLeadingWS #-}

data StringLexError loc
  = UnexpectedEOF !loc !(ContainsSmartQuote loc)
    -- ^ Unexpectedly hit EOF when lexing string
  | BadCharInitialLex !loc !(ContainsSmartQuote loc)
    -- ^ Found invalid character when initially lexing string
  | EscapeBadChar !loc
    -- ^ Found invalid character when parsing an escaped character
  | EscapeUnexpectedEOF !loc
    -- ^ Unexpectedly hit EOF when parsing an escaped character
  | EscapeNumRangeError !loc
    -- ^ Escaped number exceeds range
  | EscapeSmartQuoteError !Char !loc
    -- ^ Found escaped smart unicode chars as `\’` or `\”`
  deriving (Int -> StringLexError loc -> String -> String
[StringLexError loc] -> String -> String
StringLexError loc -> String
(Int -> StringLexError loc -> String -> String)
-> (StringLexError loc -> String)
-> ([StringLexError loc] -> String -> String)
-> Show (StringLexError loc)
forall loc.
Show loc =>
Int -> StringLexError loc -> String -> String
forall loc. Show loc => [StringLexError loc] -> String -> String
forall loc. Show loc => StringLexError loc -> String
forall a.
(Int -> a -> String -> String)
-> (a -> String) -> ([a] -> String -> String) -> Show a
$cshowsPrec :: forall loc.
Show loc =>
Int -> StringLexError loc -> String -> String
showsPrec :: Int -> StringLexError loc -> String -> String
$cshow :: forall loc. Show loc => StringLexError loc -> String
show :: StringLexError loc -> String
$cshowList :: forall loc. Show loc => [StringLexError loc] -> String -> String
showList :: [StringLexError loc] -> String -> String
Show)

-- | When initially lexing the string, we want to track if we've
-- seen a smart quote, to show a helpful "you might be accidentally
-- using a smart quote" error.
data ContainsSmartQuote loc
  = NoSmartQuote
  | SmartQuote !Char !loc
  deriving (Int -> ContainsSmartQuote loc -> String -> String
[ContainsSmartQuote loc] -> String -> String
ContainsSmartQuote loc -> String
(Int -> ContainsSmartQuote loc -> String -> String)
-> (ContainsSmartQuote loc -> String)
-> ([ContainsSmartQuote loc] -> String -> String)
-> Show (ContainsSmartQuote loc)
forall loc.
Show loc =>
Int -> ContainsSmartQuote loc -> String -> String
forall loc.
Show loc =>
[ContainsSmartQuote loc] -> String -> String
forall loc. Show loc => ContainsSmartQuote loc -> String
forall a.
(Int -> a -> String -> String)
-> (a -> String) -> ([a] -> String -> String) -> Show a
$cshowsPrec :: forall loc.
Show loc =>
Int -> ContainsSmartQuote loc -> String -> String
showsPrec :: Int -> ContainsSmartQuote loc -> String -> String
$cshow :: forall loc. Show loc => ContainsSmartQuote loc -> String
show :: ContainsSmartQuote loc -> String
$cshowList :: forall loc.
Show loc =>
[ContainsSmartQuote loc] -> String -> String
showList :: [ContainsSmartQuote loc] -> String -> String
Show)

-- -----------------------------------------------------------------------------
-- Escape characters

-- | After finding a backslash, parse the rest of the escape character, starting
-- at the given location.
resolveEscapeCharacter :: GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
resolveEscapeCharacter :: forall loc.
GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
resolveEscapeCharacter GetChar loc
getChar loc
loc0 = do
  (c0, loc1) <- loc -> Either (StringLexError loc) (Char, loc)
expectChar loc
loc0
  case c0 of
    Char
'a'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\a', loc
loc1)
    Char
'b'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\b', loc
loc1)
    Char
'f'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\f', loc
loc1)
    Char
'n'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\n', loc
loc1)
    Char
'r'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\r', loc
loc1)
    Char
't'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\t', loc
loc1)
    Char
'v'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\v', loc
loc1)
    Char
'\\' -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\\', loc
loc1)
    Char
'"'  -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\"', loc
loc1)
    Char
'\'' -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\'', loc
loc1)
    -- escape codes
    Char
'x' -> (Char -> Bool)
-> Int
-> (Char -> Int)
-> loc
-> Either (StringLexError loc) (Char, loc)
expectNum Char -> Bool
is_hexdigit Int
16 Char -> Int
hexDigit loc
loc1
    Char
'o' -> (Char -> Bool)
-> Int
-> (Char -> Int)
-> loc
-> Either (StringLexError loc) (Char, loc)
expectNum Char -> Bool
is_octdigit Int
8 Char -> Int
octDecDigit loc
loc1
    Char
_ | Char -> Bool
is_decdigit Char
c0 -> (Char -> Bool)
-> Int
-> (Char -> Int)
-> loc
-> Either (StringLexError loc) (Char, loc)
expectNum Char -> Bool
is_decdigit Int
10 Char -> Int
octDecDigit loc
loc0
    -- control characters (e.g. '\^M')
    Char
'^' -> do
      (c1, loc2) <- loc -> Either (StringLexError loc) (Char, loc)
expectChar loc
loc1
      unless ('@' <= c1 && c1 <= '_') $ Left $ EscapeBadChar loc1
      pure (chr $ ord c1 - ord '@', loc2)
    -- long form escapes (e.g. '\NUL')
    Char
_ | Just (Char
c1, loc
loc2) <- GetChar loc -> Char -> GetChar loc
forall loc. GetChar loc -> Char -> GetChar loc
parseLongEscape GetChar loc
getChar Char
c0 loc
loc1 -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
c1, loc
loc2)
    -- check unicode smart quotes (#21843)
    Char
_ | Char -> Bool
isDoubleSmartQuote Char
c0 -> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (Char, loc))
-> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. (a -> b) -> a -> b
$ Char -> loc -> StringLexError loc
forall loc. Char -> loc -> StringLexError loc
EscapeSmartQuoteError Char
c0 loc
loc0
    Char
_ | Char -> Bool
isSingleSmartQuote Char
c0 -> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (Char, loc))
-> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. (a -> b) -> a -> b
$ Char -> loc -> StringLexError loc
forall loc. Char -> loc -> StringLexError loc
EscapeSmartQuoteError Char
c0 loc
loc0
    -- unknown escape
    Char
_ -> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (Char, loc))
-> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. (a -> b) -> a -> b
$ loc -> StringLexError loc
forall loc. loc -> StringLexError loc
EscapeBadChar loc
loc0
  where
    expectChar :: loc -> Either (StringLexError loc) (Char, loc)
expectChar loc
loc =
      case GetChar loc
getChar loc
loc of
        Just (Char, loc)
x -> (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char, loc)
x
        Maybe (Char, loc)
Nothing -> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) (Char, loc))
-> StringLexError loc -> Either (StringLexError loc) (Char, loc)
forall a b. (a -> b) -> a -> b
$ loc -> StringLexError loc
forall loc. loc -> StringLexError loc
EscapeUnexpectedEOF loc
loc

    expectNum :: (Char -> Bool)
-> Int
-> (Char -> Int)
-> loc
-> Either (StringLexError loc) (Char, loc)
expectNum Char -> Bool
isDigit Int
base Char -> Int
toDigit loc
loc1 = do
      (c1, loc2) <- loc -> Either (StringLexError loc) (Char, loc)
expectChar loc
loc1
      unless (isDigit c1) $ Left $ EscapeBadChar loc1
      let parseNum Int
x loc
loc =
            case GetChar loc
getChar loc
loc of
              Just (Char
c, loc
loc') | Char -> Bool
isDigit Char
c -> do
                let x' :: Int
x' = Int
x Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
base Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
toDigit Char
c
                Bool
-> Either (StringLexError loc) () -> Either (StringLexError loc) ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
x' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0x10ffff) (Either (StringLexError loc) () -> Either (StringLexError loc) ())
-> Either (StringLexError loc) () -> Either (StringLexError loc) ()
forall a b. (a -> b) -> a -> b
$ StringLexError loc -> Either (StringLexError loc) ()
forall a b. a -> Either a b
Left (StringLexError loc -> Either (StringLexError loc) ())
-> StringLexError loc -> Either (StringLexError loc) ()
forall a b. (a -> b) -> a -> b
$ loc -> StringLexError loc
forall loc. loc -> StringLexError loc
EscapeNumRangeError loc
loc
                Int -> loc -> Either (StringLexError loc) (Char, loc)
parseNum Int
x' loc
loc'
              Maybe (Char, loc)
_ ->
                (Char, loc) -> Either (StringLexError loc) (Char, loc)
forall a. a -> Either (StringLexError loc) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int -> Char
chr Int
x, loc
loc)
      parseNum (toDigit c1) loc2
{-# INLINE resolveEscapeCharacter #-}

parseLongEscape :: GetChar loc -> Char -> loc -> Maybe (Char, loc)
parseLongEscape :: forall loc. GetChar loc -> Char -> GetChar loc
parseLongEscape GetChar loc
getChar Char
c0 loc
loc1 = [(Char, loc)] -> Maybe (Char, loc)
forall a. [a] -> Maybe a
listToMaybe ([(Char, loc)] -> Maybe (Char, loc))
-> [(Char, loc)] -> Maybe (Char, loc)
forall a b. (a -> b) -> a -> b
$ ((String, Char) -> Maybe (Char, loc))
-> [(String, Char)] -> [(Char, loc)]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe (String, Char) -> Maybe (Char, loc)
tryParse [(String, Char)]
longEscapeCodes
  where
    tryParse :: (String, Char) -> Maybe (Char, loc)
tryParse (String
prefix, Char
c) = do
      p0 : p <- String -> Maybe String
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure String
prefix
      guard (p0 == c0)          -- see if the first character matches
      loc <- parsePrefix loc1 p -- see if the rest of the prefix matches
      pure (c, loc)

    parsePrefix :: loc -> String -> Maybe loc
parsePrefix loc
loc = \case
      [] -> loc -> Maybe loc
forall a. a -> Maybe a
forall (f :: * -> *) a. Applicative f => a -> f a
pure loc
loc
      Char
p : String
ps -> do
        (c, loc') <- GetChar loc
getChar loc
loc
        guard (p == c)
        parsePrefix loc' ps

    longEscapeCodes :: [(String, Char)]
longEscapeCodes =
      [ (String
"NUL", Char
'\NUL')
      , (String
"SOH", Char
'\SOH')
      , (String
"STX", Char
'\STX')
      , (String
"ETX", Char
'\ETX')
      , (String
"EOT", Char
'\EOT')
      , (String
"ENQ", Char
'\ENQ')
      , (String
"ACK", Char
'\ACK')
      , (String
"BEL", Char
'\BEL')
      , (String
"BS", Char
'\BS')
      , (String
"HT", Char
'\HT')
      , (String
"LF", Char
'\LF')
      , (String
"VT", Char
'\VT')
      , (String
"FF", Char
'\FF')
      , (String
"CR", Char
'\CR')
      , (String
"SO", Char
'\SO')
      , (String
"SI", Char
'\SI')
      , (String
"DLE", Char
'\DLE')
      , (String
"DC1", Char
'\DC1')
      , (String
"DC2", Char
'\DC2')
      , (String
"DC3", Char
'\DC3')
      , (String
"DC4", Char
'\DC4')
      , (String
"NAK", Char
'\NAK')
      , (String
"SYN", Char
'\SYN')
      , (String
"ETB", Char
'\ETB')
      , (String
"CAN", Char
'\CAN')
      , (String
"EM", Char
'\EM')
      , (String
"SUB", Char
'\SUB')
      , (String
"ESC", Char
'\ESC')
      , (String
"FS", Char
'\FS')
      , (String
"GS", Char
'\GS')
      , (String
"RS", Char
'\RS')
      , (String
"US", Char
'\US')
      , (String
"SP", Char
'\SP')
      , (String
"DEL", Char
'\DEL')
      ]
{-# INLINE parseLongEscape #-}

-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)

isDoubleSmartQuote :: Char -> Bool
isDoubleSmartQuote :: Char -> Bool
isDoubleSmartQuote = \case
  Char
'“' -> Bool
True
  Char
'”' -> Bool
True
  Char
_ -> Bool
False

isSingleSmartQuote :: Char -> Bool
isSingleSmartQuote :: Char -> Bool
isSingleSmartQuote = \case
  Char
'‘' -> Bool
True
  Char
'’' -> Bool
True
  Char
_ -> Bool
False

{-
Note [Multiline string literals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Multiline string literals were added following the acceptance of the
proposal: https://github.com/ghc-proposals/ghc-proposals/pull/569

Multiline string literals are syntax sugar for normal string literals,
with an extra post processing step. This all happens in the Lexer; that
is, HsMultilineString will contain the post-processed string. This matches
the same behavior as HsString, which contains the normalized string
(see Note [Literal source text]).

The canonical steps for post processing a multiline string are:
1. Collapse string gaps
2. Split the string by newlines
3. Convert leading tabs into spaces
    * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
4. Remove common whitespace prefix in every line (see below)
5. If a line contains only whitespace, remove all of the whitespace
6. Join the string back with `\n` delimiters
7. If the first character of the string is a newline, remove it
8. Interpret escaped characters

However, for performance reasons, we do as much of this in one pass as possible:
1. As we lex the string, do the following steps as they appear:
    a. Collapse string gaps
    b. Keep track of the common whitespace prefix so far
    c. Validate escaped characters
2. At the very end, post process the lexed string:
    a. Remove the common whitespace prefix from every line
    b. Remove all whitespace from all-whitespace lines
    c. Remove initial newline character
    d. Resolve escaped characters

The common whitespace prefix can be informally defined as "The longest
prefix of whitespace shared by all lines in the string, excluding the
first line and any whitespace-only lines".

It's more precisely defined with the following algorithm:

1. Take a list representing the lines in the string
2. Ignore the following elements in the list:
    * The first line (we want to ignore everything before the first newline)
    * Empty lines
    * Lines with only whitespace characters
3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
-}

-- | See Note [Multiline string literals]
postprocessMultiline :: Int -> String -> String
postprocessMultiline :: Int -> String -> String
postprocessMultiline Int
commonWSPrefix =
      String -> String
rmCommonWhitespacePrefix
  (String -> String) -> (String -> String) -> String -> String
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> String -> String
collapseOnlyWsLines
  (String -> String) -> (String -> String) -> String -> String
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> String -> String
rmFirstNewline
  (String -> String) -> (String -> String) -> String -> String
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> String -> String
rmLastNewline
  (String -> String) -> (String -> String) -> String -> String
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> String -> String
resolveEscapeChars
  where
    rmCommonWhitespacePrefix :: String -> String
rmCommonWhitespacePrefix =
      let go :: String -> String
go = \case
            Char
'\n' : String
s -> Char
'\n' Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
go (Int -> String -> String
forall {t}. (Ord t, Num t) => t -> String -> String
dropLine Int
commonWSPrefix String
s)
            Char
c : String
s -> Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
go String
s
            [] -> []
          -- drop x characters from the string, or up to a newline, whichever
          -- comes first
          dropLine :: t -> String -> String
dropLine !t
x = \case
            String
s | t
x t -> t -> Bool
forall a. Ord a => a -> a -> Bool
<= t
0 -> String
s
            s :: String
s@(Char
'\n' : String
_) -> String
s
            Char
_ : String
s -> t -> String -> String
dropLine (t
x t -> t -> t
forall a. Num a => a -> a -> a
- t
1) String
s
            [] -> []
       in String -> String
go

    collapseOnlyWsLines :: String -> String
collapseOnlyWsLines =
      let go :: String -> String
go = \case
            Char
'\n' : String
s | Just String
s' <- String -> Maybe String
checkAllWs String
s -> Char
'\n' Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
go String
s'
            Char
c : String
s -> Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
go String
s
            [] -> []
          checkAllWs :: String -> Maybe String
checkAllWs = \case
            -- got all the way to a newline or the end of the string, return
            s :: String
s@(Char
'\n' : String
_) -> String -> Maybe String
forall a. a -> Maybe a
Just String
s
            s :: String
s@[] -> String -> Maybe String
forall a. a -> Maybe a
Just String
s
            -- found whitespace, continue
            Char
c : String
s | Char -> Bool
is_space Char
c -> String -> Maybe String
checkAllWs String
s
            -- anything else, stop
            String
_ -> Maybe String
forall a. Maybe a
Nothing
       in String -> String
go

    rmFirstNewline :: String -> String
rmFirstNewline = \case
      Char
'\n' : String
s -> String
s
      String
s -> String
s

    rmLastNewline :: String -> String
rmLastNewline =
      let go :: String -> String
go = \case
            [] -> []
            [Char
'\n'] -> []
            Char
c : String
cs -> Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
go String
cs
       in String -> String
go

    -- resolve escape characters, deferred from lexString. guaranteed
    -- to not throw any errors, since we already checked them in lexString
    resolveEscapeChars :: String -> String
resolveEscapeChars = \case
      [] -> []
      Char
'\\' : String
s ->
        -- concretizing 'loc' to String:
        --   resolveEscapeCharacter :: (String -> Maybe (Char, String)) -> String -> Either _ (Char, String)
        case GetChar String
-> String -> Either (StringLexError String) (Char, String)
forall loc.
GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
resolveEscapeCharacter GetChar String
forall {a}. [a] -> Maybe (a, [a])
uncons String
s of
          Left StringLexError String
e -> String -> String
forall a. HasCallStack => String -> a
panic (String -> String) -> String -> String
forall a b. (a -> b) -> a -> b
$ String
"resolving escape characters in multiline string unexpectedly found errors: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ StringLexError String -> String
forall a. Show a => a -> String
show StringLexError String
e
          Right (Char
c, String
s') -> Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
resolveEscapeChars String
s'
      Char
c : String
s -> Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
resolveEscapeChars String
s

    uncons :: [a] -> Maybe (a, [a])
uncons = \case
      a
c : [a]
cs -> (a, [a]) -> Maybe (a, [a])
forall a. a -> Maybe a
Just (a
c, [a]
cs)
      [] -> Maybe (a, [a])
forall a. Maybe a
Nothing

-- -----------------------------------------------------------------------------
-- Helpers

isAnyChar :: Char -> Bool
isAnyChar :: Char -> Bool
isAnyChar Char
c
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> Char
'\x7f' = Char -> Bool
isPrint Char
c
  | Bool
otherwise  = Char -> Bool
is_any Char
c