[Add HtmlParse module, version 1.7 David Fox **20071205135503] { addfile ./Ugly/HtmlParser.hs hunk ./Ugly/HtmlParser.hs 1 +-- |Use HaXml to parse HTML and then convert the result +-- back to the XHtml Html data type. +module HtmlParser + ( parse + ) where + +import qualified Text.XML.HaXml.Types as H +--import Text.XML.HaXml.Pretty +import qualified Text.XML.HaXml.Html.Parse as H +import qualified Text.XHtml.Transitional as X +import System.IO.Unsafe +import Control.Exception + +-- |The HaXml HTML parser calls error if the parse fails, so we +-- need to run it in the IO monad to catch that error. +parse :: String -> X.Html +parse text = unsafePerformIO (parseM text) + +-- The HaXml data type: +-- data Document = Document Prolog (SymTab EntityDef) Element [Misc] +-- data Element = Elem Name [Attribute] [Content] +-- data Content = CElem Element | CString Bool CharData | CRef Reference | CMisc Misc + +parseM :: String -> IO X.Html +parseM text = + do + -- The $! is required to force the parse error to occur before + -- we test for it. That's why they call it "unsafe IO" I guess. + result <- try . return $! H.htmlParse "text" (addHtml text) + case result of + Right (H.Document _ _ elem _) -> return . X.toHtml . stripHtml . H.CElem $ elem + Left _ -> return . X.stringToHtml $ text + where + addHtml :: String -> String + addHtml x = "" ++ x ++ "" + stripHtml :: H.Content -> H.Content + stripHtml (H.CElem (H.Elem "html" _ [content])) = stripHtml content + stripHtml x = x + +instance X.HTML H.Content where + toHtml = contentToHtml + where + contentToHtml :: H.Content -> X.Html + contentToHtml (H.CElem element) = elementToHtml element + contentToHtml (H.CString _bool chardata) = X.primHtml chardata + contentToHtml (H.CRef (H.RefEntity er)) = X.primHtmlChar er + contentToHtml (H.CRef (H.RefChar ch)) = X.primHtml ("#" ++ show ch) + contentToHtml (H.CMisc (H.Comment _comment)) = X.noHtml + contentToHtml (H.CMisc (H.PI (_pitarget, _string))) = X.noHtml + elementToHtml :: H.Element -> X.Html + elementToHtml (H.Elem name [] []) = X.itag name + elementToHtml (H.Elem name attrs []) = X.itag name X.! map attrsToHtml (attrs :: [(H.Name, H.AttValue)]) + elementToHtml (H.Elem name [] content) = X.tag name (X.concatHtml (map contentToHtml content)) + elementToHtml (H.Elem name attrs content) = X.tag name (X.concatHtml (map contentToHtml content)) X.! map attrsToHtml attrs + attrsToHtml :: (H.Name, H.AttValue) -> X.HtmlAttr + attrsToHtml (name, H.AttValue attValue) = X.strAttr name (concat (map attValueToString attValue)) + attValueToString (Left string) = string + attValueToString (Right (H.RefEntity er)) = "&" ++ er ++ ";" + attValueToString (Right (H.RefChar ch)) = "&#" ++ show ch ++ ";" + +-- For debugging +instance Show H.Document where + show (H.Document a b c d) = "(H.Document " ++ show a ++ " " ++ show b ++ " " ++ show c ++ " " ++ show d ++ ")" +instance Show H.Prolog where + show _ = "" +instance Show H.EntityDef where + show _ = "" +instance Show H.Element where + show (H.Elem a b c) = "(H.Elem " ++ " " ++ show a ++ " " ++ show b ++ " " ++ show c ++ ")" +instance Show H.AttValue where + show (H.AttValue x) = "(H.AttValue " ++ show x ++ ")" +instance Show H.Content where + show (H.CElem x) = "(H.CElem " ++ show x ++ ")" + show (H.CString a b) = "(H.CString " ++ show a ++ " " ++ show b ++ ")" + show (H.CRef r) = "(H.CRef " ++ show r ++ ")" + show (H.CMisc x) = "(H.CMisc " ++ show x ++ ")" +instance Show H.Reference where + show (H.RefEntity e) = "(H.RefEntity " ++ show e ++ ")" + show (H.RefChar r) = "(H.RefChar " ++ show r ++ ")" +instance Show H.Misc where + show (H.Comment c) = "(H.Comment " ++ show c ++ ")" + show (H.PI p) = "(H.PI " ++ show p ++ ")" hunk ./Ugly.cabal 17 - Ugly.HtmlForm, Ugly.Image, Ugly.Traverse, Ugly.URI, + Ugly.HtmlForm, Ugly.HtmlParse, Ugly.Traverse, Ugly.URI, Ugly.Image, hunk ./debian/changelog 5 + * Add the Ugly.HtmlParse module, which uses HaXml to parse HTML + and then converts it back to the Text.XHtml data type. }