switch to using hashes from Data.Hashable

The only thing one might worry about is more probable hash collisions (since hashable uses Int values for hashes, we only get 64 (or 32) bit hashes as opposed to the 128 bits we were getting with MD5. I have included a comment giving some rough ballpark estimates for the probability of collisions. I think in practice it is not a concern, except perhaps when building thousands of diagrams simultaneously on a 32-bit system.
diagrams · Nov 28, 2013 · c93a65b · c93a65b
1 parent efe65a8
commit c93a65b
Show file tree

Hide file tree

Showing 4 changed files with 125 additions and 109 deletions.
diff --git a/diagrams-builder.cabal b/diagrams-builder.cabal
@@ -56,10 +56,9 @@ library
                        transformers >= 0.3 && < 0.4,
                        split >= 0.2 && < 0.3,
                        haskell-src-exts >= 1.13.1 && < 1.15,
-                       cryptohash >= 0.8 && < 0.12,
-                       bytestring >= 0.9.2 && < 0.11,
                        cmdargs >= 0.6 && < 0.11,
-                       lens >= 3.9 && < 3.11
+                       lens >= 3.9 && < 3.11,
+                       hashable >= 1.2 && < 1.3
   hs-source-dirs:      src
   default-language:    Haskell2010
   other-extensions:    StandaloneDeriving,

diff --git a/src/Diagrams/Builder.hs b/src/Diagrams/Builder.hs
@@ -23,13 +23,13 @@ module Diagrams.Builder
          -- ** Options
          BuildOpts(..), mkBuildOpts, backendOpts, snippets, pragmas, imports, decideRegen, diaExpr, postProcess
 
+         -- ** Regeneration decision functions
+       , alwaysRegenerate, hashedRegenerate
+
          -- ** Building
        , buildDiagram, BuildResult(..)
        , ppInterpError
 
-         -- ** Regeneration decision functions
-       , alwaysRegenerate, hashedRegenerate
-
          -- * Interpreting diagrams
          -- $interp
        , setDiagramImports
@@ -46,16 +46,12 @@ import           Control.Lens                        ((^.))
 import           Control.Monad                       (guard, mplus, mzero)
 import           Control.Monad.Error                 (catchError)
 import           Control.Monad.Trans.Maybe           (MaybeT, runMaybeT)
-import           Crypto.Hash                         (Digest, MD5,
-                                                      digestToHexByteString,
-                                                      hash)
-import qualified Data.ByteString.Char8               as B
-import           Data.List                           (nub)
+import           Data.Hashable                       (Hashable (..))
+import           Data.List                           (foldl', nub)
 import           Data.List.Split                     (splitOn)
 import           Data.Maybe                          (catMaybes, fromMaybe)
 import           Data.Typeable                       (Typeable)
 import           System.Directory                    (doesFileExist,
-                                                      getDirectoryContents,
                                                       getTemporaryDirectory,
                                                       removeFile)
 import           System.FilePath                     (takeBaseName, (<.>),
@@ -121,11 +117,11 @@ getHsenvArgv = do
 --   source file, using some backend to produce a result.  The
 --   expression can be of type @Diagram b v@ or @IO (Diagram b v)@.
 interpretDiagram
-  :: forall b v x.
+  :: forall b v.
      ( Typeable b, Typeable v
      , InnerSpace v, OrderedField (Scalar v), Backend b v
      )
-  => BuildOpts b v x
+  => BuildOpts b v
   -> FilePath
   -> IO (Either InterpreterError (Result b v))
 interpretDiagram bopts m = do
@@ -158,15 +154,14 @@ ppInterpError (GhcException err) = "GhcException: " ++ err
 ------------------------------------------------------------
 
 -- | Potential results of a dynamic diagram building operation.
-data BuildResult b v x =
+data BuildResult b v =
     ParseErr  String              -- ^ Parsing of the code failed.
   | InterpErr InterpreterError    -- ^ Interpreting the code
                                   --   failed. See 'ppInterpError'.
-  | Skipped x                     -- ^ This diagram did not need to be
-                                  --   regenerated.
-  | OK x (Result b v)             -- ^ A successful build, yielding a
-                                  --   backend-specific result and
-                                  --   some extra information.
+  | Skipped Hash                  -- ^ This diagram did not need to be
+                                  --   regenerated; includes the hash.
+  | OK Hash (Result b v)          -- ^ A successful build, yielding the
+                                  --   hash and a backend-specific result.
 
 -- | Build a diagram by writing the given source code to a temporary
 --   module and interpreting the given expression, which can be of
@@ -176,9 +171,9 @@ data BuildResult b v x =
 buildDiagram
   :: ( Typeable b, Typeable v
      , InnerSpace v, OrderedField (Scalar v), Backend b v
-     , Show (Options b v)
+     , Hashable (Options b v)
      )
-  => BuildOpts b v x -> IO (BuildResult b v x)
+  => BuildOpts b v -> IO (BuildResult b v)
 buildDiagram bopts = do
   let bopts' = bopts
              & snippets %~ map unLit
@@ -187,11 +182,16 @@ buildDiagram bopts = do
   case createModule Nothing bopts' of
     Left  err -> return (ParseErr err)
     Right m@(Module _ _ _ _ _ srcImps _) -> do
-      liHashes <- getLocalImportHashes srcImps
-      regen    <- (bopts ^. decideRegen) (prettyPrint m ++ (bopts ^. diaExpr) ++ show (bopts ^. backendOpts) ++ concat liHashes)
+      liHash <- hashLocalImports srcImps
+      let diaHash
+            = 0 `hashWithSalt` prettyPrint m
+                `hashWithSalt` (bopts ^. diaExpr)
+                `hashWithSalt` (bopts ^. backendOpts)
+                `hashWithSalt` liHash
+      regen    <- (bopts ^. decideRegen) diaHash
       case regen of
-        (info, Nothing)  -> return $ Skipped info
-        (info, Just upd) -> do
+        Nothing  -> return $ Skipped diaHash
+        Just upd -> do
           tmpDir   <- getTemporaryDirectory
           (tmp, h) <- openTempFile tmpDir ("Diagram.hs")
           let m' = replaceModuleName (takeBaseName tmp) m
@@ -200,17 +200,16 @@ buildDiagram bopts = do
 
           compilation <- interpretDiagram (bopts' & backendOpts %~ upd) tmp
           removeFile tmp
-          return $ either InterpErr (OK info) compilation
+          return $ either InterpErr (OK diaHash) compilation
 
--- | Take a list of imports, and return hashes of the contents of
+-- | Take a list of imports, and return a hash of the contents of
 --   those imports which are local.  Note, this only finds imports
 --   which exist relative to the current directory, which is not as
 --   general as it probably should be --- we could be calling
 --   'buildDiagram' on source code which lives anywhere.
-getLocalImportHashes :: [ImportDecl] -> IO [String]
-getLocalImportHashes
-  = (fmap . map) hashStr
-  . fmap catMaybes
+hashLocalImports :: [ImportDecl] -> IO Hash
+hashLocalImports
+  = fmap (foldl' hashWithSalt 0 . catMaybes)
   . mapM getLocalSource
   . map (foldr1 (</>) . splitOn "." . getModuleName . importModule)
 
@@ -234,46 +233,3 @@ getLocal m = tryExt "hs" `mplus` tryExt "lhs"
     tryExt ext = do
       let f = m <.> ext
       liftIO (doesFileExist f) >>= guard >> liftIO (readFile f)
-
--- | Convenience function suitable to be given as the final argument
---   to 'buildDiagram'.  It implements the simple policy of always
---   rebuilding every diagram.
-alwaysRegenerate :: String -> IO ((), Maybe (a -> a))
-alwaysRegenerate _ = return ((), Just id)
-
--- | Convenience function suitable to be given as the final argument
---   to 'buildDiagram'.  It works by hashing the given diagram source,
---   and looking in the specified directory for any file whose base
---   name is equal to the hash.  If there is such a file, it specifies
---   that the diagram should not be rebuilt.  Otherwise, it specifies
---   that the diagram should be rebuilt, and uses the provided
---   function to update the rendering options based on the generated
---   hash.  (Most likely, one would want to set the requested output
---   file to the hash followed by some extension.)  It also returns
---   the generated hash.
-hashedRegenerate
-  :: (String -> a -> a)
-     -- ^ A function for computing an update to rendering options,
-     --   given a new base filename computed from a hash of the
-     --   diagram source.
-
-  -> FilePath
-     -- ^ The directory in which to look for generated files
-
-  -> String
-     -- ^ The \"source\" to hash. Note that this does not actually
-     -- have to be valid source code.  A common trick is to
-     -- concatenate the actual source code with String representations
-     -- of any other information on which the diagram depends.
-
-  -> IO (String, Maybe (a -> a))
-
-hashedRegenerate upd d src = do
-  let fileBase = hashStr src
-  files <- getDirectoryContents d
-  case any ((fileBase==) . takeBaseName) files of
-    True  -> return (fileBase, Nothing)
-    False -> return (fileBase, Just (upd fileBase))
-
-hashStr :: String -> String
-hashStr = B.unpack . digestToHexByteString . (hash :: B.ByteString -> Digest MD5) . B.pack
diff --git a/src/Diagrams/Builder/Modules.hs b/src/Diagrams/Builder/Modules.hs
@@ -35,7 +35,7 @@ import           Diagrams.Builder.Opts
 --   Returns the updated module, or an error message if parsing
 --   failed.
 createModule :: Maybe String -- ^ Module name to use
-             -> BuildOpts b v x
+             -> BuildOpts b v
              -> Either String Module
 createModule nm opts = do
   ms <- mapM doModuleParse (opts ^. snippets)

diff --git a/src/Diagrams/Builder/Opts.hs b/src/Diagrams/Builder/Opts.hs
@@ -17,16 +17,36 @@
 --
 -----------------------------------------------------------------------------
 module Diagrams.Builder.Opts
-    (
-      BuildOpts(..), mkBuildOpts, backendOpts, snippets, pragmas, imports, decideRegen, diaExpr, postProcess
+    ( -- * Options
+
+      Hash
+    , BuildOpts(..), mkBuildOpts, backendOpts, snippets, pragmas, imports, decideRegen, diaExpr, postProcess
+
+      -- * Rebuilding
+
+    , alwaysRegenerate, hashedRegenerate, hashToHexStr
     )
     where
 
-import           Control.Lens     (Lens, Lens', generateSignatures, lensRules,
+import           Control.Lens     (Lens', generateSignatures, lensRules,
                                    makeLensesWith, (&), (.~))
+import           System.Directory (getDirectoryContents)
+import           System.FilePath  (takeBaseName)
+import           Text.Printf
 
 import           Diagrams.Prelude (Diagram, Options)
 
+-- | Synonym for more perspicuous types.
+--
+--   We use @Int@ values for hashes because that's what the @Hashable@
+--   package uses.  Assuming diagram hashes are uniformly distributed,
+--   on a 64-bit system one needs to build on the order of billions of
+--   diagrams before the probability of a hash collision exceeds 1/2,
+--   and for anything up to tens of millions of diagrams the
+--   probability of a collision is under 0.1%.  On 32-bit systems
+--   those become tens of thousands and thousands, respectively.
+type Hash = Int
+
 -- | Options to control the behavior of @buildDiagram@.  Create one
 --   with 'mkBuildOpts' followed by using the provided lenses to
 --   override more fields; for example,
@@ -36,7 +56,7 @@ import           Diagrams.Prelude (Diagram, Options)
 --     & imports .~ [\"Foo.Bar\", \"Baz.Quux\"]
 --     & diaExpr .~ \"square 6 # fc green\"
 -- @
-data BuildOpts b v x
+data BuildOpts b v
   = BuildOpts
     { backendToken :: b
       -- ^ Backend token
@@ -46,11 +66,13 @@ data BuildOpts b v x
     , _snippets    :: [String]
     , _pragmas     :: [String]
     , _imports     :: [String]
-    , _decideRegen :: String -> IO (x, Maybe (Options b v -> Options b v))
+    , _decideRegen :: Hash -> IO (Maybe (Options b v -> Options b v))
     , _diaExpr     :: String
     , _postProcess :: Diagram b v -> Diagram b v
     }
 
+makeLensesWith (lensRules & generateSignatures .~ False) ''BuildOpts
+
 -- | Create a @BuildOpts@ record with default options:
 --
 --   * no snippets
@@ -64,63 +86,102 @@ data BuildOpts b v x
 --   * the diagram expression @circle 1@
 --
 --   * no postprocessing
-mkBuildOpts :: b -> v -> Options b v -> BuildOpts b v ()
+mkBuildOpts :: b -> v -> Options b v -> BuildOpts b v
 mkBuildOpts b v opts
-  = BuildOpts b v opts [] [] [] (const (return ((), Just id))) "circle 1" id
-
-makeLensesWith (lensRules & generateSignatures .~ False) ''BuildOpts
+  = BuildOpts b v opts [] [] [] alwaysRegenerate "circle 1" id
 
 -- | Backend-specific options to use.
-backendOpts :: Lens' (BuildOpts b v x) (Options b v)
+backendOpts :: Lens' (BuildOpts b v) (Options b v)
 
 -- | Source code snippets.  Each should be a syntactically valid
 --   Haskell module.  They will be combined intelligently, /i.e./
 --   not just pasted together textually but combining pragmas,
 --   imports, /etc./ separately.
-snippets :: Lens' (BuildOpts b v x) [String]
+snippets :: Lens' (BuildOpts b v) [String]
 
 -- | Extra @LANGUAGE@ pragmas to use (@NoMonomorphismRestriction@
 --   is automatically enabled.)
-pragmas :: Lens' (BuildOpts b v x) [String]
+pragmas :: Lens' (BuildOpts b v) [String]
 
 -- | Additional module imports (note that "Diagrams.Prelude" is
 --   automatically imported).
-imports :: Lens' (BuildOpts b v x) [String]
-
--- | A function to decide whether a particular diagram needs to
---   be regenerated.  It will be passed the final assembled
---   source for the diagram (but with the module name set to
---   @Main@ instead of something auto-generated, so that hashing
---   the source will produce consistent results across runs). It
---   can return some information (such as a hash of the source)
---   via the @x@ result, which will be passed through to the
---   result of 'buildDiagram'.  More importantly, it decides
---   whether the diagram should be built: a result of 'Just'
---   means the diagram /should/ be built; 'Nothing' means it
---   should not. In the case that it should be built, it returns
---   a function for updating the rendering options.  This could
---   be used, /e.g./, to request a filename based on a hash of
---   the source.
+imports :: Lens' (BuildOpts b v) [String]
+
+-- | A function to decide whether a particular diagram needs to be
+--   regenerated.  It will be passed a hash of the final assembled
+--   source for the diagram (but with the module name set to @Main@
+--   instead of something auto-generated, so that hashing the source
+--   will produce consistent results across runs), plus any options,
+--   local imports, and other things which could affect the result of
+--   rendering. It can return some information (such as a hash of the
+--   source) via the @x@ result, which will be passed through to the
+--   result of 'buildDiagram'.  More importantly, it decides whether
+--   the diagram should be built: a result of 'Just' means the diagram
+--   /should/ be built; 'Nothing' means it should not. In the case
+--   that it should be built, it returns a function for updating the
+--   rendering options.  This could be used, /e.g./, to request a
+--   filename based on a hash of the source.
 --
 --   Two standard decision functions are provided for
 --   convenience: 'alwaysRegenerate' returns no extra information
 --   and always decides to regenerate the diagram;
 --   'hashedRegenerate' creates a hash of the diagram source and
 --   looks for a file with that name in a given directory.
-decideRegen :: Lens (BuildOpts b v x) (BuildOpts b v x')
-                    (String -> IO (x, Maybe (Options b v -> Options b v)))
-                    (String -> IO (x', Maybe (Options b v -> Options b v)))
+decideRegen :: Lens' (BuildOpts b v) (Hash -> IO (Maybe (Options b v -> Options b v)))
 
 -- | The diagram expression to interpret.  All the given import sand
 --   snippets will be in scope, with the given LANGUAGE pragmas
 --   enabled.  The expression may have either of the types @Diagram b
 --   v@ or @IO (Diagram b v)@.
-diaExpr :: Lens' (BuildOpts b v x) String
+diaExpr :: Lens' (BuildOpts b v) String
 
 -- | A function to apply to the interpreted diagram prior to
 --   rendering.  For example, you might wish to apply @pad 1.1
 --   . centerXY@.  This is preferred over directly modifying the
 --   string expression to be interpreted, since it gives better
 --   typechecking, and works no matter whether the expression
 --   represents a diagram or an IO action.
-postProcess :: Lens' (BuildOpts b v x) (Diagram b v -> Diagram b v)
+postProcess :: Lens' (BuildOpts b v) (Diagram b v -> Diagram b v)
+
+-- | Convenience function suitable to be given as the final argument
+--   to 'buildDiagram'.  It implements the simple policy of always
+--   rebuilding every diagram.
+alwaysRegenerate :: Hash -> IO (Maybe (a -> a))
+alwaysRegenerate _ = return (Just id)
+
+-- | Convenience function suitable to be given as the final argument
+--   to 'buildDiagram'.  It works by converting the hash value to a
+--   zero-padded hexadecimal string and looking in the specified
+--   directory for any file whose base name is equal to the hash.  If
+--   there is such a file, it specifies that the diagram should not be
+--   rebuilt.  Otherwise, it specifies that the diagram should be
+--   rebuilt, and uses the provided function to update the rendering
+--   options based on the generated hash string.  (Most likely, one
+--   would want to set the requested output file to the hash followed
+--   by some extension.)
+hashedRegenerate
+  :: (String -> a -> a)
+     -- ^ A function for computing an update to rendering options,
+     --   given a new base filename computed from a hash of the
+     --   diagram source.
+
+  -> FilePath
+     -- ^ The directory in which to look for generated files
+
+  -> Hash
+     -- ^ The hash
+
+  -> IO (Maybe (a -> a))
+
+hashedRegenerate upd d hash = do
+  let fileBase = hashToHexStr hash
+  files <- getDirectoryContents d
+  case any ((fileBase==) . takeBaseName) files of
+    True  -> return Nothing
+    False -> return $ Just (upd fileBase)
+
+hashToHexStr :: Hash -> String
+hashToHexStr n = printf "%016x" n'
+  where
+    n' :: Integer
+    n' = fromIntegral n - fromIntegral (minBound :: Int)