-- |Use HaXml to parse HTML and then convert the result -- back to the XHtml Html data type. module HtmlParser ( parse , parseEither ) where import qualified Text.XML.HaXml.Types as H --import Text.XML.HaXml.Pretty import qualified Text.XML.HaXml.Html.Parse as H import qualified Text.XHtml.Transitional as X import System.IO.Unsafe import Control.Exception -- |The HaXml HTML parser calls error if the parse fails, so we -- need to run it in the IO monad to catch that error. parse :: String -> X.Html parse text = unsafePerformIO (parseM text) parseEither :: String -> Either SomeException X.Html parseEither text = unsafePerformIO (parseEitherM text) parseM :: String -> IO X.Html parseM text = parseEitherM text >>= either (return . X.stringToHtml . show) return -- The HaXml data type: -- data Document = Document Prolog (SymTab EntityDef) Element [Misc] -- data Element = Elem Name [Attribute] [Content] -- data Content = CElem Element | CString Bool CharData | CRef Reference | CMisc Misc parseEitherM :: String -> IO (Either SomeException X.Html) parseEitherM text = do -- The evaluate is required to force the parse error to occur before -- we test for it. That's why they call it "unsafe IO" I guess. result <- try . evaluate . H.htmlParse "text" . addHtml $ text case result of Right (H.Document _ _ elem _) -> return . Right . X.toHtml . stripHtml . H.CElem $ elem Left x -> return (Left x) where addHtml :: String -> String addHtml x = "" ++ x ++ "" stripHtml :: H.Content -> H.Content stripHtml (H.CElem (H.Elem "html" _ [content])) = stripHtml content stripHtml x = x instance X.HTML H.Content where toHtml = contentToHtml where contentToHtml :: H.Content -> X.Html contentToHtml (H.CElem element) = elementToHtml element contentToHtml (H.CString _bool chardata) = X.primHtml chardata contentToHtml (H.CRef (H.RefEntity er)) = X.primHtmlChar er contentToHtml (H.CRef (H.RefChar ch)) = X.primHtml ("#" ++ show ch) contentToHtml (H.CMisc (H.Comment _comment)) = X.noHtml contentToHtml (H.CMisc (H.PI (_pitarget, _string))) = X.noHtml elementToHtml :: H.Element -> X.Html elementToHtml (H.Elem name [] []) = X.itag name elementToHtml (H.Elem name attrs []) = X.itag name X.! map attrsToHtml (attrs :: [(H.Name, H.AttValue)]) elementToHtml (H.Elem name [] content) = X.tag name (X.concatHtml (map contentToHtml content)) elementToHtml (H.Elem name attrs content) = X.tag name (X.concatHtml (map contentToHtml content)) X.! map attrsToHtml attrs attrsToHtml :: (H.Name, H.AttValue) -> X.HtmlAttr attrsToHtml (name, H.AttValue attValue) = X.strAttr name (concat (map attValueToString attValue)) attValueToString (Left string) = string attValueToString (Right (H.RefEntity er)) = "&" ++ er ++ ";" attValueToString (Right (H.RefChar ch)) = "&#" ++ show ch ++ ";" -- For debugging instance Show H.Document where show (H.Document a b c d) = "(H.Document " ++ show a ++ " " ++ show b ++ " " ++ show c ++ " " ++ show d ++ ")" instance Show H.Prolog where show _ = "" instance Show H.EntityDef where show _ = "" instance Show H.Element where show (H.Elem a b c) = "(H.Elem " ++ " " ++ show a ++ " " ++ show b ++ " " ++ show c ++ ")" instance Show H.AttValue where show (H.AttValue x) = "(H.AttValue " ++ show x ++ ")" instance Show H.Content where show (H.CElem x) = "(H.CElem " ++ show x ++ ")" show (H.CString a b) = "(H.CString " ++ show a ++ " " ++ show b ++ ")" show (H.CRef r) = "(H.CRef " ++ show r ++ ")" show (H.CMisc x) = "(H.CMisc " ++ show x ++ ")" instance Show H.Reference where show (H.RefEntity e) = "(H.RefEntity " ++ show e ++ ")" show (H.RefChar r) = "(H.RefChar " ++ show r ++ ")" instance Show H.Misc where show (H.Comment c) = "(H.Comment " ++ show c ++ ")" show (H.PI p) = "(H.PI " ++ show p ++ ")"