-- |Scan two directories (original and copy) and find all files in the
-- copy directory which are identical to a file in the original
-- directory.  This is done by sorting all the files by size, and then
-- comparing the files in each size group.
module FindCopies
    ( Status(..)
    , copyStatus
    , findPaths
    , showStatus
    , showCounts
    , showCommon
    ) where

import Control.Exception hiding (handle)
import System.Directory
import System.FilePath
import System.IO
import System.Posix.Files
import System.Posix.Types
import Data.List
import Data.Maybe
import qualified Data.ByteString.Lazy as B
--import qualified Data.ByteString as B

-- Lets try not to get these mixed up...
type CopyPath = FilePath
type OrigPath = FilePath

-- |The status of a pair of files
data Status = NoOriginal FilePath			-- ^ Exists only in the copy directory
            | SameContent FilePath FilePath		-- ^ File content is identical
            | SameNameAndContent FilePath FilePath	-- ^ File content and name are identical
            | SamePathAndContent FilePath FilePath	-- ^ File content and (relative) path are identical
            | SameFile FilePath FilePath 		-- ^ Files are hard linked
            | Empty FilePath				-- ^ File is empty
            | SpecialFile FilePath			-- ^ Files is not regular, directory, or symlink
            | SymbolicLink FilePath			-- ^ File is a symbolic link
              deriving Show

data FileInfo = FileInfo { path :: FilePath
                         , stat :: FileStatus
                         , handle :: Handle
                         , text :: B.ByteString }

-- |Examine all the files in two directories, designated original and
-- copy.  For each path in the copy directory, determine which of the
-- statuses given in the Status type above applies to each, and pass
-- that status to a function supplied by the caller.  The designations
-- 'copy' and 'original' are more or less arbitrary, but the idea is
-- that you might want to remove the duplicate files in the copy
-- directory, or hard link them to the files in the original
-- directory.
copyStatus :: (Status -> IO ()) -> FilePath -> FilePath -> IO ()
copyStatus f origTop copyTop =
    do origFiles <- findPaths origTop >>= return . groupFiles
       copyFiles <- findPaths copyTop >>= return . groupFiles
       copyStatus' origTop copyTop f origFiles copyFiles
    where
      -- Group files of equal size.
      groupFiles :: [(FileStatus, FilePath)] -> [(FileOffset, [(FileStatus, FilePath)])]
      groupFiles pairs = map unpair . group . sort $ pairs
      -- Sort the files by size
      sort = sortBy (\ a b -> compare (fileSize (fst b)) (fileSize (fst a)))
      -- Group the files by size
      group = groupBy (\ a b -> fileSize (fst a) == fileSize (fst b))
      -- extract the common file size from each group
      unpair :: [(FileStatus, FilePath)] -> (FileOffset, [(FileStatus, FilePath)])
      unpair [] = error $ "Internal error"
      unpair pairs@((status, _) : _) = (fileSize status, pairs)

-- This helper function processes the list of size groups.
copyStatus' :: FilePath -> FilePath -> (Status -> IO ()) -> [(FileOffset, [(FileStatus, OrigPath)])] -> [(FileOffset, [(FileStatus, CopyPath)])] -> IO ()
copyStatus' _ _ _ _ [] = return ()
copyStatus' _ _ _ [] _ = return ()
copyStatus' origTop copyTop f allOriginals@((originalSize, originals) : nextOriginals) allCopies@((copySize, copies) : nextCopies) =
    case compare originalSize copySize of
      GT -> copyStatus' origTop copyTop f nextOriginals allCopies
      LT -> copyStatus' origTop copyTop f allOriginals nextCopies
      EQ -> compareFiles originals copies >>
            copyStatus' origTop copyTop f nextOriginals nextCopies
    where
      compareFiles originals copies =
          do --hPutStrLn stderr ("Group: " ++ show (map snd originals ++ map snd copies))
             let (originalStatus, originalPaths) = unzip originals
                 (copyStatus, copyPaths) = unzip copies
             originalHandles <- mapM open originalPaths
             originalText <- mapM read originalHandles
             copyHandles <- mapM open copyPaths
             copyText <- mapM read copyHandles
             let originalInfo = map makeInfo . catMaybes . map zapMissing $ zip4 originalPaths originalStatus originalHandles originalText
             let copyInfo = map makeInfo . catMaybes . map zapMissing $ zip4 copyPaths copyStatus copyHandles copyText
             let matches = map (getStatus origTop copyTop originalInfo) $ copyInfo
             mapM_ f matches
             mapM_ (\ (FileInfo {handle=handle}) -> hClose handle) (originalInfo ++ copyInfo)
      open :: FilePath -> IO (Maybe Handle)
      open path = try (openBinaryFile path ReadMode) >>= either (return . const Nothing) (return . Just)
      read :: Maybe Handle -> IO (Maybe B.ByteString)
      read = maybe (return Nothing) (\ h -> B.hGetContents h >>= return . Just)
      zapMissing (a, b, Just c, Just d) = Just (a, b, c, d)
      zapMissing _ = Nothing

makeInfo (p, s, h, t) = FileInfo {path=p, stat=s, handle=h, text=t}

-- |Compare the text of the copy to the text of each original, and return the
-- path pair of the first match.
getStatus :: FilePath -> FilePath -> [FileInfo] -> FileInfo -> Status
getStatus origTop copyTop originalCandidates copyInfo@(FileInfo {path=copyPath}) =
    foldl foldStatus (NoOriginal copyPath) (map (pairStatus copyInfo) originalCandidates)
    where
      pairStatus :: FileInfo -> FileInfo -> Status
      pairStatus (FileInfo {path=copyPath, stat=copyStatus, text=copyText}) (FileInfo {path=origPath, stat=origStatus, text=origText})
          | isSymbolicLink copyStatus = SymbolicLink copyPath
          | isBlockDevice copyStatus || isCharacterDevice copyStatus || isNamedPipe copyStatus || isSocket copyStatus = SpecialFile copyPath
          | copyText == B.empty = Empty copyPath
          | deviceID copyStatus == deviceID origStatus && fileID copyStatus == fileID origStatus = SameFile copyPath origPath
          | True =
              --error ("copyTop=" ++ copyTop ++ ", copyPath=" ++ copyPath ++ ", origTop=" ++ origTop ++ ", origPath=" ++ origPath ++ ", copySuff=" ++ show (dropPrefix copyTop copyPath) ++ ", origSuff=" ++ show (dropPrefix origTop origPath) ++ ", triple=" ++ show (copyText == origText, dropPrefix copyTop copyPath == dropPrefix origTop origPath, takeBaseName copyPath == takeBaseName origPath))
              case (copyText == origText, dropPrefix copyTop copyPath == dropPrefix origTop origPath, takeBaseName copyPath == takeBaseName origPath) of
                (True, True, _) -> SamePathAndContent copyPath origPath
                (True, _ , True) -> SameNameAndContent copyPath origPath
                (True, _, _) -> SameContent copyPath origPath
                _ -> NoOriginal copyPath
      foldStatus :: Status -> Status -> Status
      foldStatus s@(SymbolicLink _) _ = s
      foldStatus s@(SpecialFile _) _ = s
      foldStatus s@(Empty _) _ = s
      foldStatus _ s@(Empty _) = s
      foldStatus s@(SameFile _ _) _ = s
      foldStatus _ s@(SameFile _ _) = s
      -- Keep the strongest comparison
      foldStatus s@(SamePathAndContent _ _) _ = s
      foldStatus _ s@(SamePathAndContent _ _) = s
      foldStatus s@(SameNameAndContent _ _) _ = s
      foldStatus _ s@(SameNameAndContent _ _) = s
      foldStatus s@(SameContent _ _) _ = s
      foldStatus _ s@(SameContent _ _) = s
      foldStatus s _ = s

showStatus :: Status -> IO ()
showStatus status = putStrLn (show status)

showCounts :: Status -> IO ()
showCounts _ = undefined

showCommon :: Status -> IO ()
showCommon x@(SameContent _ _) = putStrLn . show $ x
showCommon x@(SameFile _ _) = putStrLn . show $ x
showCommon x@(SameNameAndContent _ _) = putStrLn . show $ x
showCommon _ = return ()

-- |Find all the non-directory files within a directory.  Returns a
-- list of each file's size and path.
findPaths :: FilePath -> IO [(FileStatus, FilePath)]
findPaths top =
    getSymbolicLinkStatus top >>= flip find ""
    where
      find parentStatus path =
          -- Be very careful not to return a pair where one is a
          -- symlink to the other - you could lose both.
          do status <- getSymbolicLinkStatus (top ++ path)
             -- Don't enter any directory that is a mountpoint.  This
             -- will not detect a mount --bind of a file system onto
             -- itself; neither will mountpoint(1) by the way.
             let mountpoint = deviceID status /= deviceID parentStatus
             case (mountpoint, isDirectory status) of
               (True, _) -> return []
               (_, True) -> do names <- getDirectoryContents (top ++ path) >>= return . filter (not . flip elem [".", ".."])
                               let subdirs = map ((path ++ "/") ++) names
                               mapM (find status) subdirs >>= return . concat
               (_, False) -> return [(status, top ++ path)]

dropPrefix p s =
    case isPrefixOf p s of
      True -> Just (drop (length p) s)
      False -> Nothing

