module GHC.Cmm.Pipeline (
  cmmPipeline
) where

import GHC.Prelude

import GHC.Driver.Flags

import GHC.Cmm
import GHC.Cmm.Config
import GHC.Cmm.ContFlowOpt
import GHC.Cmm.CommonBlockElim
import GHC.Cmm.Dataflow.Label
import GHC.Cmm.Info.Build
import GHC.Cmm.Lint
import GHC.Cmm.LayoutStack
import GHC.Cmm.ProcPoint
import GHC.Cmm.Sink
import GHC.Cmm.Switch.Implement
import GHC.Cmm.ThreadSanitizer

import GHC.Types.Unique.Supply
import GHC.Types.Unique.DSM

import GHC.Utils.Error
import GHC.Utils.Logger
import GHC.Utils.Outputable
import GHC.Utils.Misc ( partitionWith )

import GHC.Platform

import Control.Monad
import GHC.Utils.Monad (mapAccumLM)

-----------------------------------------------------------------------------
-- | Top level driver for C-- pipeline
-----------------------------------------------------------------------------

-- | Converts C-- with an implicit stack and native C-- calls into
-- optimized, CPS converted and native-call-less C--.  The latter
-- C-- can be used to generate assembly.
--
-- Every declaration of the input group is run through 'cpsTop', threading
-- the 'DUniqSupply' from one declaration to the next.  The per-declaration
-- results are then partitioned into procedures and data and handed to
-- 'doSRTs', whose output group is dumped under @Opt_D_dump_cmm_cps@ and
-- returned together with the updated 'ModuleSRTInfo' and unique supply.
cmmPipeline
 :: Logger
 -> CmmConfig
 -> ModuleSRTInfo        -- Info about SRTs generated so far
 -> CmmGroup             -- Input C-- with Procedures
 -> DUniqSupply
 -> IO ((ModuleSRTInfo, CmmGroupSRTs), DUniqSupply) -- Output CPS transformed C--
cmmPipeline logger cmm_config srtInfo prog dus0 = do
  -- Passed to 'withTimingSilent' as the function that forces the result:
  -- evaluates the SRT info and the unique supply to WHNF, and each element
  -- of the output group to WHNF.
  let forceRes ((info, group), us) = info `seq` us `seq` foldr seq () group
  let platform = cmmPlatform cmm_config
  withTimingSilent logger (text "Cmm pipeline") forceRes $ do
     -- Run the per-declaration pipeline, left to right, threading the
     -- unique supply through as the accumulator.
     (dus1, prog')  <- {-# SCC "tops" #-} mapAccumLM (cpsTop logger platform cmm_config) dus0 prog
     -- 'cpsTop' yields 'Left' for procedures and 'Right' for data.
     let (procs, data_) = partitionWith id prog'
     (srtInfo', dus2, cmms) <- {-# SCC "doSRTs" #-} doSRTs cmm_config srtInfo dus1 procs data_
     dumpWith logger Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (pdoc platform cmms)

     return ((srtInfo', cmms), dus2)

-- | The Cmm pipeline for a single 'CmmDecl'. Returns:
--
--   - in the case of a 'CmmProc': 'Left' of the resulting (possibly
--     proc-point-split) 'CmmDecl's and their 'CAFEnv'. CAF analysis
--     necessarily happens *before* proc-point splitting, as described in Note
--     [SRTs].
--
--   - in the case of a 'CmmData': 'Right' of the unmodified 'CmmDecl' and the
--     'CAFSet' that 'cafAnalData' computes from its statics.
--
-- The 'DUniqSupply' is threaded through every pass that needs fresh uniques
-- and the final supply is returned alongside the result.
cpsTop :: Logger -> Platform -> CmmConfig -> DUniqSupply -> CmmDecl -> IO (DUniqSupply, Either (CAFEnv, [CmmDecl]) (CAFSet, CmmDataDecl))
cpsTop _logger platform _ dus (CmmData section statics) =
  -- Data declarations are not transformed; only their CAF set is computed.
  return (dus, Right (cafAnalData platform statics, CmmData section statics))
cpsTop logger platform cfg dus proc =
    do
      ----------- Control-flow optimisations ----------------------------------

      -- The first round of control-flow optimisation speeds up the
      -- later passes by removing lots of empty blocks, so we do it
      -- even when optimisation isn't turned on.
      --
      -- The 'CmmData' case is handled by the equation above, so the
      -- declaration reaching this point is matched as a 'CmmProc'.
      CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-}
           return $ cmmCfgOptsProc splitting_proc_points proc
      dump Opt_D_dump_cmm_cfg "Post control-flow optimisations (1)" g

      let !TopInfo {stack_info=StackInfo { arg_space = entry_off
                                         , do_layout = do_layout }} = h

      ----------- Eliminate common blocks -------------------------------------
      g <- {-# SCC "elimCommonBlocks" #-}
           condPass (cmmOptElimCommonBlks cfg) elimCommonBlocks g
                         Opt_D_dump_cmm_cbe "Post common block elimination"

      -- Any work storing block Labels must be performed _after_
      -- elimCommonBlocks

      ----------- Implement switches ------------------------------------------
      (g, dus) <- if cmmDoCmmSwitchPlans cfg
             then {-# SCC "createSwitchPlans" #-}
                  pure $ runUniqueDSM dus $ cmmImplementSwitchPlans platform g
             else pure (g, dus)
      dump Opt_D_dump_cmm_switch "Post switch plan" g

      ----------- ThreadSanitizer instrumentation -----------------------------
      g <- {-# SCC "annotateTSAN" #-}
          if cmmOptThreadSanitizer cfg
          then do
             -- TODO(#25273): Use the deterministic UniqDSM (ie `runUniqueDSM`) instead
             -- of UniqSM (see `initUs_`) to guarantee deterministic objects
             -- when doing thread sanitization.
            us <- mkSplitUniqSupply 'u'
            return $ initUs_ us $
              annotateTSAN platform g
          else return g
      dump Opt_D_dump_cmm_thread_sanitizer "ThreadSanitizer instrumentation" g

      ----------- Proc points -------------------------------------------------
      let
        call_pps :: ProcPointSet -- LabelMap
        call_pps = {-# SCC "callProcPoints" #-} callProcPoints g
      (proc_points, dus) <-
         if splitting_proc_points
            then do
              let (pp, dus') = {-# SCC "minimalProcPointSet" #-} runUniqueDSM dus $
                    minimalProcPointSet platform call_pps g
              dumpWith logger Opt_D_dump_cmm_proc "Proc points"
                    FormatCMM (pdoc platform l $$ ppr pp $$ pdoc platform g)
              return (pp, dus')
            else
              return (call_pps, dus)

      ----------- Layout the stack and manifest Sp ----------------------------
      ((g, stackmaps), dus) <- pure $
         {-# SCC "layoutStack" #-}
         if do_layout
            then runUniqueDSM dus $ cmmLayoutStack cfg proc_points entry_off g
            else ((g, mapEmpty), dus)
      dump Opt_D_dump_cmm_sp "Layout Stack" g

      ----------- Sink and inline assignments  --------------------------------
      g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
           condPass (cmmOptSink cfg) (cmmSink platform) g
                    Opt_D_dump_cmm_sink "Sink assignments"

      ------------- CAF analysis ----------------------------------------------
      let cafEnv = {-# SCC "cafAnal" #-} cafAnal platform call_pps l g
      dumpWith logger Opt_D_dump_cmm_caf "CAFEnv" FormatText (pdoc platform cafEnv)

      (g, dus) <- if splitting_proc_points
           then do
             ------------- Split into separate procedures -----------------------
             let pp_map = {-# SCC "procPointAnalysis" #-}
                          procPointAnalysis proc_points g
             dumpWith logger Opt_D_dump_cmm_procmap "procpoint map"
                FormatCMM (ppr pp_map)
             (g, dus) <- {-# SCC "splitAtProcPoints" #-} pure $ runUniqueDSM dus $
                  splitAtProcPoints platform l call_pps proc_points pp_map
                                    (CmmProc h l v g)
             dumps Opt_D_dump_cmm_split "Post splitting" g
             return (g, dus)
           else
             -- attach info tables to return points
             return ([attachContInfoTables call_pps (CmmProc h l v g)], dus)

      ------------- Populate info tables with stack info -----------------
      g <- {-# SCC "setInfoTableStackMap" #-}
           return $ map (setInfoTableStackMap platform stackmaps) g
      dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g

      ----------- Control-flow optimisations -----------------------------
      g <- {-# SCC "cmmCfgOpts(2)" #-}
           return $ if cmmOptControlFlow cfg
                    then map (cmmCfgOptsProc splitting_proc_points) g
                    else g
      g <- return $ map (removeUnreachableBlocksProc platform) g
           -- See Note [unreachable blocks]
      dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations (2)" g

      return (dus, Left (cafEnv, g))

  where -- Dump a single graph, linting it first when 'cmmDoLinting' is set.
        dump :: DumpFlag -> String -> CmmGraph -> IO ()
        dump = dumpGraph logger platform (cmmDoLinting cfg)

        -- Dump every declaration of a group separately (no linting).
        dumps :: DumpFlag -> String -> CmmGroup -> IO ()
        dumps flag name
           = mapM_ (dumpWith logger flag name FormatCMM . pdoc platform)

        -- Run a pure graph pass and dump its result, but only when the
        -- pass is enabled; otherwise return the graph unchanged and dump
        -- nothing.
        condPass :: Bool
                 -> (CmmGraph -> CmmGraph)
                 -> CmmGraph
                 -> DumpFlag
                 -> String
                 -> IO CmmGraph
        condPass do_opt pass g dumpflag dumpname =
            if do_opt
               then do
                    g <- return $ pass g
                    dump dumpflag dumpname g
                    return g
               else return g

        -- we don't need to split proc points for the NCG, unless
        -- tablesNextToCode is off.  The latter is because we have no
        -- label to put on info tables for basic blocks that are not
        -- the entry point.
        splitting_proc_points :: Bool
        splitting_proc_points = cmmSplitProcPoints cfg

-- Note [Sinking after stack layout]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- In the past we considered running sinking pass also before stack
-- layout, but after making some measurements we realized that:
--
--   a) running sinking only before stack layout produces slower
--      code than running sinking only after stack layout
--
--   b) running sinking both before and after stack layout produces
--      code that has the same performance as when running sinking
--      only after stack layout.
--
-- In other words sinking before stack layout doesn't buy us anything.
--
-- An interesting question is "why is it better to run sinking after
-- stack layout"? It seems that the major reason is the stores and loads
-- generated by stack layout. Consider this code before stack layout:
--
--  c1E:
--      _c1C::P64 = R3;
--      _c1B::P64 = R2;
--      _c1A::P64 = R1;
--      I64[(young<c1D> + 8)] = c1D;
--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--  c1D:
--      R3 = _c1C::P64;
--      R2 = _c1B::P64;
--      R1 = _c1A::P64;
--      call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- Stack layout pass will save all local variables live across a call
-- (_c1C, _c1B and _c1A in this example) on the stack just before
-- making a call and reload them from the stack after returning from a
-- call:
--
--  c1E:
--      _c1C::P64 = R3;
--      _c1B::P64 = R2;
--      _c1A::P64 = R1;
--      I64[Sp - 32] = c1D;
--      P64[Sp - 24] = _c1A::P64;
--      P64[Sp - 16] = _c1B::P64;
--      P64[Sp - 8] = _c1C::P64;
--      Sp = Sp - 32;
--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--  c1D:
--      _c1A::P64 = P64[Sp + 8];
--      _c1B::P64 = P64[Sp + 16];
--      _c1C::P64 = P64[Sp + 24];
--      R3 = _c1C::P64;
--      R2 = _c1B::P64;
--      R1 = _c1A::P64;
--      Sp = Sp + 32;
--      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- If we don't run sinking pass after stack layout we are basically
-- left with such code. However, running sinking on this code can lead
-- to significant improvements:
--
--  c1E:
--      I64[Sp - 32] = c1D;
--      P64[Sp - 24] = R1;
--      P64[Sp - 16] = R2;
--      P64[Sp - 8] = R3;
--      Sp = Sp - 32;
--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
--  c1D:
--      R3 = P64[Sp + 24];
--      R2 = P64[Sp + 16];
--      R1 = P64[Sp + 8];
--      Sp = Sp + 32;
--      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
--
-- Now we only have 9 assignments instead of 15.
--
-- There is one case when running sinking before stack layout could
-- be beneficial. Consider this:
--
--   L1:
--      x = y
--      call f() returns L2
--   L2: ...x...y...
--
-- Since both x and y are live across a call to f, they will be stored
-- on the stack during stack layout and restored after the call:
--
--   L1:
--      x = y
--      P64[Sp - 24] = L2
--      P64[Sp - 16] = x
--      P64[Sp - 8]  = y
--      Sp = Sp - 24
--      call f() returns L2
--   L2:
--      y = P64[Sp + 16]
--      x = P64[Sp + 8]
--      Sp = Sp + 24
--      ...x...y...
--
-- However, if we run sinking before stack layout we would propagate x
-- to its usage place (both x and y must be local registers for this to
-- be possible - global registers cannot be floated past a call):
--
--   L1:
--      x = y
--      call f() returns L2
--   L2: ...y...y...
--
-- Thus making x dead at the call to f(). If we ran stack layout now
-- we would generate fewer stores and loads:
--
--   L1:
--      x = y
--      P64[Sp - 16] = L2
--      P64[Sp - 8]  = y
--      Sp = Sp - 16
--      call f() returns L2
--   L2:
--      y = P64[Sp + 8]
--      Sp = Sp + 16
--      ...y...y...
--
-- But since we don't see any benefits from running sinking before stack
-- layout, this situation probably doesn't arise too often in practice.
--

{- Note [inconsistent-pic-reg]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~
On x86/Darwin, PIC is implemented by inserting a sequence like

    call 1f
 1: popl %reg

at the proc entry point, and then referring to labels as offsets from
%reg.  If we don't split proc points, then we could have many entry
points in a proc that would need this sequence, and each entry point
would then get a different value for %reg.  If there are any join
points, then at the join point we don't have a consistent value for
%reg, so we don't know how to refer to labels.

Hence, on x86/Darwin, we have to split proc points, and then each proc
point will get its own PIC initialisation sequence.

This isn't an issue on x86/ELF, where the sequence is

    call 1f
 1: popl %reg
    addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg

so %reg always has a consistent value: the address of
_GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.

-}

{- Note [unreachable blocks]
   ~~~~~~~~~~~~~~~~~~~~~~~~~
The control-flow optimiser sometimes leaves unreachable blocks behind
containing junk code.  These aren't necessarily a problem, but
removing them is good because it might save time in the native code
generator later.

-}

-- | Optionally lint a Cmm graph, then dump it.
--
-- When @do_linting@ is 'True' the graph is first checked with
-- 'cmmLintGraph'; if that returns an error document it is reported via
-- 'fatalErrorMsg' and followed by @'ghcExit' logger 1@.  Afterwards the
-- graph is passed to 'dumpWith' under the given flag and title, using
-- 'FormatCMM'.
dumpGraph :: Logger -> Platform -> Bool -> DumpFlag -> String -> CmmGraph -> IO ()
dumpGraph logger platform do_linting flag name g = do
  when do_linting $ do_lint g
  dumpWith logger flag name FormatCMM (pdoc platform g)
 where
  -- Report a lint failure and exit; do nothing when the graph lints cleanly.
  do_lint :: CmmGraph -> IO ()
  do_lint graph = case cmmLintGraph platform graph of
                 Just err -> do { fatalErrorMsg logger err
                                ; ghcExit logger 1
                                }
                 Nothing  -> return ()

-- | Dump a document for one Cmm pipeline stage.
--
-- The document is always offered to 'putDumpFileMaybe' under @flag@.  If
-- @flag@ itself is not enabled on the logger but @Opt_D_dump_cmm_verbose@
-- is, it is additionally written with 'logDumpFile' (still under @flag@).
-- Finally it is offered to 'putDumpFileMaybe' once more under
-- @Opt_D_dump_cmm_verbose_by_proc@.
dumpWith :: Logger -> DumpFlag -> String -> DumpFormat -> SDoc -> IO ()
dumpWith logger flag txt fmt sdoc = do
  putDumpFileMaybe logger flag txt fmt sdoc
  unless (logHasDumpFlag logger flag) $
    -- If `-ddump-cmm-verbose -ddump-to-file` is specified,
    -- dump each Cmm pipeline stage output to a separate file.  #16930
    when (logHasDumpFlag logger Opt_D_dump_cmm_verbose)
      $ logDumpFile logger (mkDumpStyle alwaysQualify) flag txt fmt sdoc
  putDumpFileMaybe logger Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc