-- |Use HaXml to parse HTML and then convert the result
-- back to the XHtml Html data type.
module HtmlParser
    ( parse
    , parseEither
    ) where

import qualified Text.XML.HaXml.Types as H
--import Text.XML.HaXml.Pretty
import qualified Text.XML.HaXml.Html.Parse as H
import qualified Text.XHtml.Transitional as X
import System.IO.Unsafe
import Control.Exception

-- |The HaXml HTML parser calls error if the parse fails, so we
-- need to run it in the IO monad to catch that error.
parse :: String -> X.Html
parse text = unsafePerformIO (parseM text)

parseEither :: String -> Either Exception X.Html
parseEither text = unsafePerformIO (parseEitherM text)

parseM :: String -> IO X.Html
parseM text = parseEitherM text >>= either (return . X.stringToHtml . show) return

-- The HaXml data type:
-- data Document = Document Prolog (SymTab EntityDef) Element [Misc]
-- data Element = Elem Name [Attribute] [Content]
-- data Content = CElem Element | CString Bool CharData | CRef Reference | CMisc Misc

parseEitherM :: String -> IO (Either Exception X.Html)
parseEitherM text =
    do 
      -- The evaluate is required to force the parse error to occur before
      -- we test for it.  That's why they call it "unsafe IO" I guess.
      result <- try . evaluate . H.htmlParse "text" . addHtml $ text
      case result of
        Right (H.Document _ _ elem _) -> return . Right . X.toHtml . stripHtml . H.CElem $ elem
        Left x -> return (Left x)
    where
      addHtml :: String -> String
      addHtml x = "<html>" ++ x ++ "</html>"
      stripHtml :: H.Content -> H.Content
      stripHtml (H.CElem (H.Elem "html" _ [content])) = stripHtml content
      stripHtml x = x

instance X.HTML H.Content where
    toHtml = contentToHtml
        where
          contentToHtml :: H.Content -> X.Html
          contentToHtml (H.CElem element) = elementToHtml element
          contentToHtml (H.CString _bool chardata) = X.primHtml chardata
          contentToHtml (H.CRef (H.RefEntity er)) = X.primHtmlChar er
          contentToHtml (H.CRef (H.RefChar ch)) = X.primHtml ("#" ++ show ch)
          contentToHtml (H.CMisc (H.Comment _comment)) = X.noHtml
          contentToHtml (H.CMisc (H.PI (_pitarget, _string))) = X.noHtml
          elementToHtml :: H.Element -> X.Html
          elementToHtml (H.Elem name [] []) = X.itag name
          elementToHtml (H.Elem name attrs []) = X.itag name X.! map attrsToHtml (attrs :: [(H.Name, H.AttValue)])
          elementToHtml (H.Elem name [] content) = X.tag name (X.concatHtml (map contentToHtml content))
          elementToHtml (H.Elem name attrs content) = X.tag name (X.concatHtml (map contentToHtml content)) X.! map attrsToHtml attrs
          attrsToHtml :: (H.Name, H.AttValue) -> X.HtmlAttr
          attrsToHtml (name, H.AttValue attValue) = X.strAttr name (concat (map attValueToString attValue))
          attValueToString (Left string) = string
          attValueToString (Right (H.RefEntity er)) = "&" ++ er ++ ";"
          attValueToString (Right (H.RefChar ch)) = "&#" ++ show ch ++ ";"

-- For debugging
instance Show H.Document where
    show (H.Document a b c d) = "(H.Document " ++ show a ++ " " ++ show b ++ " " ++ show c ++ " " ++ show d ++ ")"
instance Show H.Prolog where
    show _ = "<H.Prolog>"
instance Show H.EntityDef where
    show _ = "<H.EntityDef>"
instance Show H.Element where
    show (H.Elem a b c) = "(H.Elem " ++ " " ++ show a ++ " " ++  show b ++ " " ++  show c ++ ")"
instance Show H.AttValue where
    show (H.AttValue x) = "(H.AttValue " ++ show x ++ ")"
instance Show H.Content where
    show (H.CElem x) = "(H.CElem " ++ show x ++ ")"
    show (H.CString a b) = "(H.CString " ++ show a ++ " " ++ show b ++ ")"
    show (H.CRef r) = "(H.CRef " ++ show r ++ ")"
    show (H.CMisc x) = "(H.CMisc " ++ show x ++ ")"
instance Show H.Reference where
    show (H.RefEntity e) = "(H.RefEntity " ++ show e ++ ")"
    show (H.RefChar r) = "(H.RefChar " ++ show r ++ ")"
instance Show H.Misc where
    show (H.Comment c) = "(H.Comment " ++ show c ++ ")"
    show (H.PI p) = "(H.PI " ++ show p ++ ")"

