Package: ghc Version: 7.6.3-19 Severity: wishlist Tags: patch upstream Hi,
As part of the "reproducible builds" project[1], it was discovered that ghc's interface hashes include the timestamp of dependent files[2]. This makes the compilation of Haskell packages unreproducible. Fortunately, upstream have fixed this[3] and provided a backport for 7.6.3[4]. I tested this: I applied the patch locally, rebuilt ghc, and tested that it is able to deterministically compile haskell-lrucache. Indeed, it was! It'd be much appreciated if this was applied to 7.6.3, which would affect > 300 Haskell packages that are currently not reproducible. [1]: https://wiki.debian.org/ReproducibleBuilds [2]: https://jenkins.debian.net/userContent/index_FTBR_with_buildinfo.html [3]: https://ghc.haskell.org/trac/ghc/ticket/8144 [4]: https://github.com/nh2/ghc/commit/3d61ee6c3e79913de2400aba8cecde8dbf44fc67 -- Tom Fitzhenry [email protected] -- System Information: Debian Release: jessie/sid APT prefers testing-updates APT policy: (500, 'testing-updates'), (500, 'testing') Architecture: amd64 (x86_64) Kernel: Linux 3.16.0-4-amd64 (SMP w/3 CPU cores) Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/dash Versions of packages ghc depends on: ii gcc 4:4.9.1-5 ii libbsd-dev 0.7.0-2 ii libc6 2.19-13 ii libc6-dev 2.19-13 ii libffi-dev 3.1-2 ii libffi6 3.1-2 ii libgmp-dev 2:6.0.0+dfsg-6 ii libgmp10 2:6.0.0+dfsg-6 ii libtinfo5 5.9+20140913-1 ghc recommends no packages. Versions of packages ghc suggests: ii ghc-doc 7.6.3-19 ii ghc-prof 7.6.3-19 pn haskell-doc <none> pn llvm <none> ii perl 5.20.1-2 -- debconf-show failed
>From 3d61ee6c3e79913de2400aba8cecde8dbf44fc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= <[email protected]> Date: Thu, 22 Aug 2013 11:05:56 +0900 Subject: [PATCH] Backport from 7.8: Fix interface hashes including time stamp of dependent files. Fixes #8144. Before, the modification time of e.g. #included files (and everything that ends up as a UsageFile, e.g. via addDependentFile) was taken as input for the interface hash of a module. This lead to different hashes for identical inputs on every compilation. We now use file content hashes instead. This changes the interface file format. You will get "Binary.get(Usage): 50" when you try to do an incremental using .hi files that were created with a GHC 7.7 (only) older than this commit. To calculate the md5 hash (`Fingerprint`) of a file in constant space, there now is GHC.Fingerprint.getFileHash, and a fallback version for older GHCs that needs to load the file into memory completely (only used when compiling stage1 with an older GHC). --- compiler/iface/BinIface.hs | 6 +++--- compiler/iface/MkIface.lhs | 29 +++++++++++++++-------------- compiler/main/HscTypes.lhs | 2 +- compiler/utils/Fingerprint.hsc | 34 +++++++++++++++++++++++++++++++++- libraries/base/GHC/Fingerprint.hs | 3 +++ 5 files changed, 55 insertions(+), 19 deletions(-) diff --git a/compiler/iface/BinIface.hs b/compiler/iface/BinIface.hs index 201e7bb..5a461a9 100644 --- a/compiler/iface/BinIface.hs +++ b/compiler/iface/BinIface.hs @@ -629,7 +629,7 @@ instance Binary Usage where put_ bh usg@UsageFile{} = do putByte bh 2 put_ bh (usg_file_path usg) - put_ bh (usg_mtime usg) + put_ bh (usg_file_hash usg) get bh = do h <- getByte bh @@ -649,8 +649,8 @@ instance Binary Usage where usg_exports = exps, usg_entities = ents, usg_safe = safe } 2 -> do fp <- get bh - mtime <- get bh - return UsageFile { usg_file_path = fp, usg_mtime = mtime } + hash <- get bh + return UsageFile { usg_file_path = fp, usg_file_hash = hash } i -> error ("Binary.get(Usage): " ++ show i) instance Binary Warnings where diff --git a/compiler/iface/MkIface.lhs b/compiler/iface/MkIface.lhs index cb9d32d..60d2a3f 100644 --- a/compiler/iface/MkIface.lhs +++ b/compiler/iface/MkIface.lhs @@ -879,17 +879,18 @@ mkOrphMap get_key decls \begin{code} mkUsageInfo :: HscEnv -> Module -> ImportedMods -> NameSet -> [FilePath] -> IO [Usage] mkUsageInfo hsc_env this_mod dir_imp_mods used_names dependent_files - = do { eps <- hscEPS hsc_env - ; mtimes <- mapM getModificationUTCTime dependent_files - ; let mod_usages = mk_mod_usage_info (eps_PIT eps) hsc_env this_mod - dir_imp_mods used_names - ; let usages = mod_usages ++ map to_file_usage (zip dependent_files mtimes) - ; usages `seqList` return usages } - -- seq the list of Usages returned: occasionally these - -- don't get evaluated for a while and we can end up hanging on to - -- the entire collection of Ifaces. - where - to_file_usage (f, mtime) = UsageFile { usg_file_path = f, usg_mtime = mtime } + = do + eps <- hscEPS hsc_env + hashes <- mapM getFileHash dependent_files + let mod_usages = mk_mod_usage_info (eps_PIT eps) hsc_env this_mod + dir_imp_mods used_names + let usages = mod_usages ++ [ UsageFile { usg_file_path = f + , usg_file_hash = hash } + | (f, hash) <- zip dependent_files hashes ] + usages `seqList` return usages + -- seq the list of Usages returned: occasionally these + -- don't get evaluated for a while and we can end up hanging on to + -- the entire collection of Ifaces. mk_mod_usage_info :: PackageIfaceTable -> HscEnv @@ -1338,11 +1339,11 @@ checkModUsage this_pkg UsageHomeModule{ checkModUsage _this_pkg UsageFile{ usg_file_path = file, - usg_mtime = old_mtime } = + usg_file_hash = old_hash } = liftIO $ handleIO handle $ do - new_mtime <- getModificationUTCTime file - if (old_mtime /= new_mtime) + new_hash <- getFileHash file + if (old_hash /= new_hash) then return recomp else return UpToDate where diff --git a/compiler/main/HscTypes.lhs b/compiler/main/HscTypes.lhs index 7c1f169..f8abb21 100644 --- a/compiler/main/HscTypes.lhs +++ b/compiler/main/HscTypes.lhs @@ -1645,7 +1645,7 @@ data Usage } -- ^ Module from the current package | UsageFile { usg_file_path :: FilePath, - usg_mtime :: UTCTime + usg_file_hash :: Fingerprint -- ^ External file dependency. From a CPP #include or TH addDependentFile. Should be absolute. } deriving( Eq ) diff --git a/compiler/utils/Fingerprint.hsc b/compiler/utils/Fingerprint.hsc index 735bf23..9527809 100644 --- a/compiler/utils/Fingerprint.hsc +++ b/compiler/utils/Fingerprint.hsc @@ -12,7 +12,8 @@ module Fingerprint ( Fingerprint(..), fingerprint0, readHexFingerprint, fingerprintData, - fingerprintString + fingerprintString, + getFileHash ) where #include "md5.h" @@ -20,8 +21,12 @@ module Fingerprint ( import Outputable +import Control.Monad ( when ) import Text.Printf import Numeric ( readHex ) +import Foreign +import Panic +import System.IO ##if __GLASGOW_HASKELL__ >= 701 -- The MD5 implementation is now in base, to support Typeable @@ -102,3 +107,30 @@ readHexFingerprint s = Fingerprint w1 w2 [(w1,"")] = readHex s1 [(w2,"")] = readHex (take 16 s2) + +-- Only use this if we're smaller than GHC 7.7, otherwise +-- GHC.Fingerprint exports a better version of this function. + +-- | Computes the hash of a given file. +-- It loads the full file into memory an does not work with files bigger than +-- MAXINT. +getFileHash :: FilePath -> IO Fingerprint +getFileHash path = withBinaryFile path ReadMode $ \h -> do + + fileSize <- toIntFileSize `fmap` hFileSize h + + allocaBytes fileSize $ \bufPtr -> do + n <- hGetBuf h bufPtr fileSize + when (n /= fileSize) readFailedError + fingerprintData bufPtr fileSize + + where + toIntFileSize :: Integer -> Int + toIntFileSize size + | size > fromIntegral (maxBound :: Int) = throwGhcException $ + Sorry $ "Fingerprint.getFileHash: Tried to calculate hash of file " + ++ path ++ " with size > maxBound :: Int. This is not supported." + | otherwise = fromIntegral size + + readFailedError = throwGhcException $ + Panic $ "Fingerprint.getFileHash: hGetBuf failed on interface file" diff --git a/libraries/base/GHC/Fingerprint.hs b/libraries/base/GHC/Fingerprint.hs index d1b3831..7a4e635 100644 --- a/libraries/base/GHC/Fingerprint.hs +++ b/libraries/base/GHC/Fingerprint.hs @@ -26,8 +26,11 @@ import GHC.Base import GHC.Num import GHC.List import GHC.Real +import GHC.Show import Foreign import Foreign.C +import System.IO +import Control.Monad (when) import GHC.Fingerprint.Type -- 2.1.3

