{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ViewPatterns #-}

module GHC.Parser.String (
  StringLexError (..),
  lexString,
  lexMultilineString,

  -- * Unicode smart quote helpers
  isDoubleSmartQuote,
  isSingleSmartQuote,
) where

import GHC.Prelude hiding (getChar)

import Control.Arrow ((>>>))
import Control.Monad (when)
import Data.Char (chr, ord)
import qualified Data.Foldable1 as Foldable1
import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (listToMaybe, mapMaybe)
import GHC.Data.StringBuffer (StringBuffer)
import qualified GHC.Data.StringBuffer as StringBuffer
import GHC.Parser.CharClass (
  hexDigit,
  is_decdigit,
  is_hexdigit,
  is_octdigit,
  is_space,
  octDecDigit,
 )
import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)

type BufPos = Int
data StringLexError = StringLexError LexErr BufPos

lexString :: Int -> StringBuffer -> Either StringLexError String
lexString :: Int -> StringBuffer -> Either StringLexError String
lexString = (String -> Either (Char, LexErr) String)
-> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
-> Int
-> StringBuffer
-> Either StringLexError String
lexStringWith String -> Either (Char, LexErr) String
forall c. HasChar c => [c] -> Either (c, LexErr) [c]
processChars [CharPos] -> Either (CharPos, LexErr) [CharPos]
forall c. HasChar c => [c] -> Either (c, LexErr) [c]
processChars
  where
    processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
    processChars :: forall c. HasChar c => [c] -> Either (c, LexErr) [c]
processChars =
          [c] -> [c]
forall c. HasChar c => [c] -> [c]
collapseGaps
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> Either (c, LexErr) [c]
forall c. HasChar c => [c] -> Either (c, LexErr) [c]
resolveEscapes

-- -----------------------------------------------------------------------------
-- Lexing interface

{-
Note [Lexing strings]
~~~~~~~~~~~~~~~~~~~~~

After verifying if a string is lexically valid with Alex, we still need to do
some post processing of the string, namely:
1. Collapse string gaps
2. Resolve escape characters

The problem: 'lexemeToString' is more performant than manually reading
characters from the StringBuffer. However, that completely erases the position
of each character, which we need in order to report the correct position for
error messages (e.g. when resolving escape characters).

So what we'll do is do two passes. The first pass is optimistic; just convert
to a plain String and process it. If this results in an error, we do a second
pass, this time where each character is annotated with its position. Now, the
error has all the information it needs.

Ideally, lexStringWith would take a single (forall c. HasChar c => ...) function,
but to help the specializer, we pass it in twice to concretize it for the two
types we actually use.
-}

-- | See Note [Lexing strings]
lexStringWith ::
  ([Char] -> Either (Char, LexErr) [Char])
  -> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
  -> Int
  -> StringBuffer
  -> Either StringLexError String
lexStringWith :: (String -> Either (Char, LexErr) String)
-> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
-> Int
-> StringBuffer
-> Either StringLexError String
lexStringWith String -> Either (Char, LexErr) String
processChars [CharPos] -> Either (CharPos, LexErr) [CharPos]
processCharsPos Int
len StringBuffer
buf =
  case String -> Either (Char, LexErr) String
processChars (String -> Either (Char, LexErr) String)
-> String -> Either (Char, LexErr) String
forall a b. (a -> b) -> a -> b
$ StringBuffer -> Int -> String
bufferChars StringBuffer
buf Int
len of
    Right String
s -> String -> Either StringLexError String
forall a b. b -> Either a b
Right String
s
    Left (Char, LexErr)
_ ->
      case [CharPos] -> Either (CharPos, LexErr) [CharPos]
processCharsPos ([CharPos] -> Either (CharPos, LexErr) [CharPos])
-> [CharPos] -> Either (CharPos, LexErr) [CharPos]
forall a b. (a -> b) -> a -> b
$ StringBuffer -> Int -> [CharPos]
bufferLocatedChars StringBuffer
buf Int
len of
        Right [CharPos]
_ -> String -> Either StringLexError String
forall a. HasCallStack => String -> a
panic String
"expected lex error on second pass"
        Left ((Char
_, Int
pos), LexErr
e) -> StringLexError -> Either StringLexError String
forall a b. a -> Either a b
Left (StringLexError -> Either StringLexError String)
-> StringLexError -> Either StringLexError String
forall a b. (a -> b) -> a -> b
$ LexErr -> Int -> StringLexError
StringLexError LexErr
e Int
pos

class HasChar c where
  getChar :: c -> Char
  setChar :: Char -> c -> c

instance HasChar Char where
  getChar :: Char -> Char
getChar = Char -> Char
forall a. a -> a
id
  setChar :: Char -> Char -> Char
setChar = Char -> Char -> Char
forall a b. a -> b -> a
const

instance HasChar (Char, x) where
  getChar :: (Char, x) -> Char
getChar = (Char, x) -> Char
forall a b. (a, b) -> a
fst
  setChar :: Char -> (Char, x) -> (Char, x)
setChar Char
c (Char
_, x
x) = (Char
c, x
x)

pattern Char :: HasChar c => Char -> c
pattern $mChar :: forall {r} {c}. HasChar c => c -> (Char -> r) -> ((# #) -> r) -> r
Char c <- (getChar -> c)
{-# COMPLETE Char #-}

bufferChars :: StringBuffer -> Int -> [Char]
bufferChars :: StringBuffer -> Int -> String
bufferChars = StringBuffer -> Int -> String
StringBuffer.lexemeToString

type CharPos = (Char, BufPos)

bufferLocatedChars :: StringBuffer -> Int -> [CharPos]
bufferLocatedChars :: StringBuffer -> Int -> [CharPos]
bufferLocatedChars StringBuffer
initialBuf Int
len = StringBuffer -> [CharPos]
go StringBuffer
initialBuf
  where
    go :: StringBuffer -> [CharPos]
go StringBuffer
buf
      | StringBuffer -> Bool
atEnd StringBuffer
buf = []
      | Bool
otherwise  =
          let (Char
c, StringBuffer
buf') = StringBuffer -> (Char, StringBuffer)
StringBuffer.nextChar StringBuffer
buf
           in (Char
c, StringBuffer -> Int
StringBuffer.cur StringBuffer
buf) CharPos -> [CharPos] -> [CharPos]
forall a. a -> [a] -> [a]
: StringBuffer -> [CharPos]
go StringBuffer
buf'

    atEnd :: StringBuffer -> Bool
atEnd StringBuffer
buf = StringBuffer -> StringBuffer -> Int
StringBuffer.byteDiff StringBuffer
initialBuf StringBuffer
buf Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
len

-- -----------------------------------------------------------------------------
-- Lexing phases

collapseGaps :: HasChar c => [c] -> [c]
collapseGaps :: forall c. HasChar c => [c] -> [c]
collapseGaps = [c] -> [c]
go
  where
    go :: [c] -> [c]
go = \case
      c1 :: c
c1@(Char Char
'\\') : c2 :: c
c2@(Char Char
c) : [c]
cs
        | Char -> Bool
is_space Char
c -> [c] -> [c]
go ([c] -> [c]) -> [c] -> [c]
forall a b. (a -> b) -> a -> b
$ [c] -> [c]
dropGap [c]
cs
        | Bool
otherwise  -> c
c1 c -> [c] -> [c]
forall a. a -> [a] -> [a]
: c
c2 c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
      c
c : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
      [] -> []

    dropGap :: [c] -> [c]
dropGap = \case
      Char Char
'\\' : [c]
cs -> [c]
cs
      c
_ : [c]
cs -> [c] -> [c]
dropGap [c]
cs
      [] -> String -> [c]
forall a. HasCallStack => String -> a
panic String
"gap unexpectedly ended"

resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c]
resolveEscapes :: forall c. HasChar c => [c] -> Either (c, LexErr) [c]
resolveEscapes = DList c -> [c] -> Either (c, LexErr) [c]
forall {a}. HasChar a => DList a -> [a] -> Either (a, LexErr) [a]
go DList c
forall a. DList a
dlistEmpty
  where
    go :: DList a -> [a] -> Either (a, LexErr) [a]
go !DList a
acc = \case
      [] -> [a] -> Either (a, LexErr) [a]
forall a. a -> Either (a, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([a] -> Either (a, LexErr) [a]) -> [a] -> Either (a, LexErr) [a]
forall a b. (a -> b) -> a -> b
$ DList a -> [a]
forall a. DList a -> [a]
dlistToList DList a
acc
      Char Char
'\\' : Char Char
'&' : [a]
cs -> DList a -> [a] -> Either (a, LexErr) [a]
go DList a
acc [a]
cs
      backslash :: a
backslash@(Char Char
'\\') : [a]
cs ->
        case [a] -> Either (a, LexErr) (Char, [a])
forall c. HasChar c => [c] -> Either (c, LexErr) (Char, [c])
resolveEscapeChar [a]
cs of
          Right (Char
esc, [a]
cs') -> DList a -> [a] -> Either (a, LexErr) [a]
go (DList a
acc DList a -> a -> DList a
forall a. DList a -> a -> DList a
`dlistSnoc` Char -> a -> a
forall c. HasChar c => Char -> c -> c
setChar Char
esc a
backslash) [a]
cs'
          Left (a
c, LexErr
e) -> (a, LexErr) -> Either (a, LexErr) [a]
forall a b. a -> Either a b
Left (a
c, LexErr
e)
      a
c : [a]
cs -> DList a -> [a] -> Either (a, LexErr) [a]
go (DList a
acc DList a -> a -> DList a
forall a. DList a -> a -> DList a
`dlistSnoc` a
c) [a]
cs

-- -----------------------------------------------------------------------------
-- Escape characters

-- | Resolve a escape character, after having just lexed a backslash.
-- Assumes escape character is valid.
resolveEscapeChar :: HasChar c => [c] -> Either (c, LexErr) (Char, [c])
resolveEscapeChar :: forall c. HasChar c => [c] -> Either (c, LexErr) (Char, [c])
resolveEscapeChar = \case
  Char Char
'a'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\a', [c]
cs)
  Char Char
'b'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\b', [c]
cs)
  Char Char
'f'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\f', [c]
cs)
  Char Char
'n'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\n', [c]
cs)
  Char Char
'r'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\r', [c]
cs)
  Char Char
't'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\t', [c]
cs)
  Char Char
'v'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\v', [c]
cs)
  Char Char
'\\' : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\\', [c]
cs)
  Char Char
'"'  : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\"', [c]
cs)
  Char Char
'\'' : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
'\'', [c]
cs)
  -- escape codes
  Char Char
'x' : [c]
cs -> (Char -> Bool)
-> Int -> (Char -> Int) -> [c] -> Either (c, LexErr) (Char, [c])
forall {a}.
HasChar a =>
(Char -> Bool)
-> Int -> (Char -> Int) -> [a] -> Either (a, LexErr) (Char, [a])
parseNum Char -> Bool
is_hexdigit Int
16 Char -> Int
hexDigit [c]
cs
  Char Char
'o' : [c]
cs -> (Char -> Bool)
-> Int -> (Char -> Int) -> [c] -> Either (c, LexErr) (Char, [c])
forall {a}.
HasChar a =>
(Char -> Bool)
-> Int -> (Char -> Int) -> [a] -> Either (a, LexErr) (Char, [a])
parseNum Char -> Bool
is_octdigit Int
8 Char -> Int
octDecDigit [c]
cs
  cs :: [c]
cs@(Char Char
c : [c]
_) | Char -> Bool
is_decdigit Char
c -> (Char -> Bool)
-> Int -> (Char -> Int) -> [c] -> Either (c, LexErr) (Char, [c])
forall {a}.
HasChar a =>
(Char -> Bool)
-> Int -> (Char -> Int) -> [a] -> Either (a, LexErr) (Char, [a])
parseNum Char -> Bool
is_decdigit Int
10 Char -> Int
octDecDigit [c]
cs
  -- control characters (e.g. '\^M')
  Char Char
'^' : Char Char
c : [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'@', [c]
cs)
  -- long form escapes (e.g. '\NUL')
  [c]
cs | Just (Char
esc, [c]
cs') <- [c] -> Maybe (Char, [c])
forall c. HasChar c => [c] -> Maybe (Char, [c])
parseLongEscape [c]
cs -> (Char, [c]) -> Either (c, LexErr) (Char, [c])
forall a. a -> Either (c, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char
esc, [c]
cs')
  -- shouldn't happen
  Char Char
c : [c]
_ -> String -> Either (c, LexErr) (Char, [c])
forall a. HasCallStack => String -> a
panic (String -> Either (c, LexErr) (Char, [c]))
-> String -> Either (c, LexErr) (Char, [c])
forall a b. (a -> b) -> a -> b
$ String
"found unexpected escape character: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Char -> String
forall a. Show a => a -> String
show Char
c
  [] -> String -> Either (c, LexErr) (Char, [c])
forall a. HasCallStack => String -> a
panic String
"escape character unexpectedly ended"
  where
    parseNum :: (Char -> Bool)
-> Int -> (Char -> Int) -> [a] -> Either (a, LexErr) (Char, [a])
parseNum Char -> Bool
isDigit Int
base Char -> Int
toDigit =
      let go :: Int -> [a] -> Either (a, LexErr) (Char, [a])
go Int
x = \case
            ch :: a
ch@(Char Char
c) : [a]
cs | Char -> Bool
isDigit Char
c -> do
              let x' :: Int
x' = Int
x Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
base Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
toDigit Char
c
              Bool -> Either (a, LexErr) () -> Either (a, LexErr) ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
x' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0x10ffff) (Either (a, LexErr) () -> Either (a, LexErr) ())
-> Either (a, LexErr) () -> Either (a, LexErr) ()
forall a b. (a -> b) -> a -> b
$ (a, LexErr) -> Either (a, LexErr) ()
forall a b. a -> Either a b
Left (a
ch, LexErr
LexNumEscapeRange)
              Int -> [a] -> Either (a, LexErr) (Char, [a])
go Int
x' [a]
cs
            [a]
cs -> (Char, [a]) -> Either (a, LexErr) (Char, [a])
forall a. a -> Either (a, LexErr) a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int -> Char
chr Int
x, [a]
cs)
       in Int -> [a] -> Either (a, LexErr) (Char, [a])
go Int
0

parseLongEscape :: HasChar c => [c] -> Maybe (Char, [c])
parseLongEscape :: forall c. HasChar c => [c] -> Maybe (Char, [c])
parseLongEscape [c]
cs = [(Char, [c])] -> Maybe (Char, [c])
forall a. [a] -> Maybe a
listToMaybe (((String, Char) -> Maybe (Char, [c]))
-> [(String, Char)] -> [(Char, [c])]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe (String, Char) -> Maybe (Char, [c])
tryParse [(String, Char)]
longEscapeCodes)
  where
    tryParse :: (String, Char) -> Maybe (Char, [c])
tryParse (String
code, Char
esc) =
      case Int -> [c] -> ([c], [c])
forall a. Int -> [a] -> ([a], [a])
splitAt (String -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length String
code) [c]
cs of
        ([c]
pre, [c]
cs') | (c -> Char) -> [c] -> String
forall a b. (a -> b) -> [a] -> [b]
map c -> Char
forall c. HasChar c => c -> Char
getChar [c]
pre String -> String -> Bool
forall a. Eq a => a -> a -> Bool
== String
code -> (Char, [c]) -> Maybe (Char, [c])
forall a. a -> Maybe a
Just (Char
esc, [c]
cs')
        ([c], [c])
_ -> Maybe (Char, [c])
forall a. Maybe a
Nothing

    longEscapeCodes :: [(String, Char)]
longEscapeCodes =
      [ (String
"NUL", Char
'\NUL')
      , (String
"SOH", Char
'\SOH')
      , (String
"STX", Char
'\STX')
      , (String
"ETX", Char
'\ETX')
      , (String
"EOT", Char
'\EOT')
      , (String
"ENQ", Char
'\ENQ')
      , (String
"ACK", Char
'\ACK')
      , (String
"BEL", Char
'\BEL')
      , (String
"BS" , Char
'\BS' )
      , (String
"HT" , Char
'\HT' )
      , (String
"LF" , Char
'\LF' )
      , (String
"VT" , Char
'\VT' )
      , (String
"FF" , Char
'\FF' )
      , (String
"CR" , Char
'\CR' )
      , (String
"SO" , Char
'\SO' )
      , (String
"SI" , Char
'\SI' )
      , (String
"DLE", Char
'\DLE')
      , (String
"DC1", Char
'\DC1')
      , (String
"DC2", Char
'\DC2')
      , (String
"DC3", Char
'\DC3')
      , (String
"DC4", Char
'\DC4')
      , (String
"NAK", Char
'\NAK')
      , (String
"SYN", Char
'\SYN')
      , (String
"ETB", Char
'\ETB')
      , (String
"CAN", Char
'\CAN')
      , (String
"EM" , Char
'\EM' )
      , (String
"SUB", Char
'\SUB')
      , (String
"ESC", Char
'\ESC')
      , (String
"FS" , Char
'\FS' )
      , (String
"GS" , Char
'\GS' )
      , (String
"RS" , Char
'\RS' )
      , (String
"US" , Char
'\US' )
      , (String
"SP" , Char
'\SP' )
      , (String
"DEL", Char
'\DEL')
      ]

-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)

isDoubleSmartQuote :: Char -> Bool
isDoubleSmartQuote :: Char -> Bool
isDoubleSmartQuote = \case
  Char
'“' -> Bool
True
  Char
'”' -> Bool
True
  Char
_ -> Bool
False

isSingleSmartQuote :: Char -> Bool
isSingleSmartQuote :: Char -> Bool
isSingleSmartQuote = \case
  Char
'‘' -> Bool
True
  Char
'’' -> Bool
True
  Char
_ -> Bool
False

-- -----------------------------------------------------------------------------
-- Multiline strings

-- | See Note [Multiline string literals]
--
-- Assumes string is lexically valid. Skips the steps about splitting
-- and rejoining lines, and instead manually find newline characters,
-- for performance.
lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
lexMultilineString = (String -> Either (Char, LexErr) String)
-> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
-> Int
-> StringBuffer
-> Either StringLexError String
lexStringWith String -> Either (Char, LexErr) String
forall c. HasChar c => [c] -> Either (c, LexErr) [c]
processChars [CharPos] -> Either (CharPos, LexErr) [CharPos]
forall c. HasChar c => [c] -> Either (c, LexErr) [c]
processChars
  where
    processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
    processChars :: forall c. HasChar c => [c] -> Either (c, LexErr) [c]
processChars =
          [c] -> [c]
forall c. HasChar c => [c] -> [c]
collapseGaps             -- Step 1
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> [c]
forall c. HasChar c => [c] -> [c]
normalizeEOL
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> [c]
forall c. HasChar c => [c] -> [c]
expandLeadingTabs        -- Step 3
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> [c]
forall c. HasChar c => [c] -> [c]
rmCommonWhitespacePrefix -- Step 4
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> [c]
forall c. HasChar c => [c] -> [c]
collapseOnlyWsLines      -- Step 5
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> [c]
forall c. HasChar c => [c] -> [c]
rmFirstNewline           -- Step 7a
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> [c]
forall c. HasChar c => [c] -> [c]
rmLastNewline            -- Step 7b
      ([c] -> [c])
-> ([c] -> Either (c, LexErr) [c]) -> [c] -> Either (c, LexErr) [c]
forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> [c] -> Either (c, LexErr) [c]
forall c. HasChar c => [c] -> Either (c, LexErr) [c]
resolveEscapes           -- Step 8

    -- Normalize line endings to LF. The spec dictates that lines should be
    -- split on newline characters and rejoined with ``\n``. But because we
    -- aren't actually splitting/rejoining, we'll manually normalize here
    normalizeEOL :: HasChar c => [c] -> [c]
    normalizeEOL :: forall c. HasChar c => [c] -> [c]
normalizeEOL =
      let go :: [c] -> [c]
go = \case
            Char Char
'\r' : c :: c
c@(Char Char
'\n') : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
            c :: c
c@(Char Char
'\r') : [c]
cs -> Char -> c -> c
forall c. HasChar c => Char -> c -> c
setChar Char
'\n' c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
            c :: c
c@(Char Char
'\f') : [c]
cs -> Char -> c -> c
forall c. HasChar c => Char -> c -> c
setChar Char
'\n' c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
            c
c : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
            [] -> []
       in [c] -> [c]
go

    -- expands all tabs, since the lexer will verify that tabs can only appear
    -- as leading indentation
    expandLeadingTabs :: HasChar c => [c] -> [c]
    expandLeadingTabs :: forall c. HasChar c => [c] -> [c]
expandLeadingTabs =
      let go :: Int -> [c] -> [c]
go !Int
col = \case
            c :: c
c@(Char Char
'\t') : [c]
cs ->
              let fill :: Int
fill = Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
- (Int
col Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
8)
               in Int -> c -> [c]
forall a. Int -> a -> [a]
replicate Int
fill (Char -> c -> c
forall c. HasChar c => Char -> c -> c
setChar Char
' ' c
c) [c] -> [c] -> [c]
forall a. [a] -> [a] -> [a]
++ Int -> [c] -> [c]
go (Int
col Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
fill) [c]
cs
            c
c : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: Int -> [c] -> [c]
go (if c -> Char
forall c. HasChar c => c -> Char
getChar c
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\n' then Int
0 else Int
col Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) [c]
cs
            [] -> []
       in Int -> [c] -> [c]
forall {c}. HasChar c => Int -> [c] -> [c]
go Int
0

    rmCommonWhitespacePrefix :: HasChar c => [c] -> [c]
    rmCommonWhitespacePrefix :: forall c. HasChar c => [c] -> [c]
rmCommonWhitespacePrefix [c]
cs0 =
      let commonWSPrefix :: Int
commonWSPrefix = String -> Int
getCommonWsPrefix ((c -> Char) -> [c] -> String
forall a b. (a -> b) -> [a] -> [b]
map c -> Char
forall c. HasChar c => c -> Char
getChar [c]
cs0)
          go :: [c] -> [c]
go = \case
            c :: c
c@(Char Char
'\n') : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go (Int -> [c] -> [c]
forall {t} {c}. (Ord t, Num t, HasChar c) => t -> [c] -> [c]
dropLine Int
commonWSPrefix [c]
cs)
            c
c : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
            [] -> []
          -- drop x characters from the string, or up to a newline, whichever
          -- comes first
          dropLine :: t -> [c] -> [c]
dropLine !t
x = \case
            [c]
cs | t
x t -> t -> Bool
forall a. Ord a => a -> a -> Bool
<= t
0 -> [c]
cs
            cs :: [c]
cs@(Char Char
'\n' : [c]
_) -> [c]
cs
            c
_ : [c]
cs -> t -> [c] -> [c]
dropLine (t
x t -> t -> t
forall a. Num a => a -> a -> a
- t
1) [c]
cs
            [] -> []
       in [c] -> [c]
go [c]
cs0

    collapseOnlyWsLines :: HasChar c => [c] -> [c]
    collapseOnlyWsLines :: forall c. HasChar c => [c] -> [c]
collapseOnlyWsLines =
      let go :: [c] -> [c]
go = \case
            c :: c
c@(Char Char
'\n') : [c]
cs | Just [c]
cs' <- [c] -> Maybe [c]
checkAllWs [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs'
            c
c : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
            [] -> []
          checkAllWs :: [c] -> Maybe [c]
checkAllWs = \case
            -- got all the way to a newline or the end of the string, return
            cs :: [c]
cs@(Char Char
'\n' : [c]
_) -> [c] -> Maybe [c]
forall a. a -> Maybe a
Just [c]
cs
            cs :: [c]
cs@[] -> [c] -> Maybe [c]
forall a. a -> Maybe a
Just [c]
cs
            -- found whitespace, continue
            Char Char
c : [c]
cs | Char -> Bool
is_space Char
c -> [c] -> Maybe [c]
checkAllWs [c]
cs
            -- anything else, stop
            [c]
_ -> Maybe [c]
forall a. Maybe a
Nothing
       in [c] -> [c]
go

    rmFirstNewline :: HasChar c => [c] -> [c]
    rmFirstNewline :: forall c. HasChar c => [c] -> [c]
rmFirstNewline = \case
      Char Char
'\n' : [c]
cs -> [c]
cs
      [c]
cs -> [c]
cs

    rmLastNewline :: HasChar c => [c] -> [c]
    rmLastNewline :: forall c. HasChar c => [c] -> [c]
rmLastNewline =
      let go :: [c] -> [c]
go = \case
            [] -> []
            [Char Char
'\n'] -> []
            c
c : [c]
cs -> c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
: [c] -> [c]
go [c]
cs
       in [c] -> [c]
go

-- | See step 4 in Note [Multiline string literals]
--
-- Assumes tabs have already been expanded.
getCommonWsPrefix :: String -> Int
getCommonWsPrefix :: String -> Int
getCommonWsPrefix String
s =
  case [String] -> Maybe (NonEmpty String)
forall a. [a] -> Maybe (NonEmpty a)
NonEmpty.nonEmpty [String]
includedLines of
    Maybe (NonEmpty String)
Nothing -> Int
0
    Just NonEmpty String
ls -> NonEmpty Int -> Int
forall a. Ord a => NonEmpty a -> a
forall (t :: * -> *) a. (Foldable1 t, Ord a) => t a -> a
Foldable1.minimum (NonEmpty Int -> Int) -> NonEmpty Int -> Int
forall a b. (a -> b) -> a -> b
$ (String -> Int) -> NonEmpty String -> NonEmpty Int
forall a b. (a -> b) -> NonEmpty a -> NonEmpty b
NonEmpty.map (String -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length (String -> Int) -> (String -> String) -> String -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> String -> String
forall a. (a -> Bool) -> [a] -> [a]
takeWhile Char -> Bool
is_space) NonEmpty String
ls
  where
    includedLines :: [String]
includedLines =
        (String -> Bool) -> [String] -> [String]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (String -> Bool) -> String -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Char -> Bool
is_space) -- ignore whitespace-only lines
      ([String] -> [String])
-> ([String] -> [String]) -> [String] -> [String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [String] -> [String]
forall a. Int -> [a] -> [a]
drop Int
1                      -- ignore first line in calculation
      ([String] -> [String]) -> [String] -> [String]
forall a b. (a -> b) -> a -> b
$ String -> [String]
lines String
s

{-
Note [Multiline string literals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Multiline string literals were added following the acceptance of the
proposal: https://github.com/ghc-proposals/ghc-proposals/pull/569

Multiline string literals are syntax sugar for normal string literals,
with an extra post processing step. This all happens in the Lexer; that
is, HsMultilineString will contain the post-processed string. This matches
the same behavior as HsString, which contains the normalized string
(see Note [Literal source text]).

The canonical steps for post processing a multiline string are:
1. Collapse string gaps
2. Split the string by newlines
3. Convert leading tabs into spaces
    * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
4. Remove common whitespace prefix in every line except the first (see below)
5. If a line contains only whitespace, remove all of the whitespace
6. Join the string back with `\n` delimiters
7a. If the first character of the string is a newline, remove it
7b. If the last character of the string is a newline, remove it
8. Interpret escaped characters

The common whitespace prefix can be informally defined as "The longest
prefix of whitespace shared by all lines in the string, excluding the
first line and any whitespace-only lines".

It's more precisely defined with the following algorithm:

1. Take a list representing the lines in the string
2. Ignore the following elements in the list:
    * The first line (we want to ignore everything before the first newline)
    * Empty lines
    * Lines with only whitespace characters
3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
-}

-- -----------------------------------------------------------------------------
-- DList

newtype DList a = DList ([a] -> [a])

dlistEmpty :: DList a
dlistEmpty :: forall a. DList a
dlistEmpty = ([a] -> [a]) -> DList a
forall a. ([a] -> [a]) -> DList a
DList [a] -> [a]
forall a. a -> a
id

dlistToList :: DList a -> [a]
dlistToList :: forall a. DList a -> [a]
dlistToList (DList [a] -> [a]
f) = [a] -> [a]
f []

dlistSnoc :: DList a -> a -> DList a
dlistSnoc :: forall a. DList a -> a -> DList a
dlistSnoc (DList [a] -> [a]
f) a
x = ([a] -> [a]) -> DList a
forall a. ([a] -> [a]) -> DList a
DList ([a] -> [a]
f ([a] -> [a]) -> ([a] -> [a]) -> [a] -> [a]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a
x a -> [a] -> [a]
forall a. a -> [a] -> [a]
:))