[Add HtmlParse module, version 1.7
David Fox **20071205135503] {
addfile ./Ugly/HtmlParser.hs
hunk ./Ugly/HtmlParser.hs 1
+-- |Use HaXml to parse HTML and then convert the result
+-- back to the XHtml Html data type.
+module HtmlParser
+ ( parse
+ ) where
+
+import qualified Text.XML.HaXml.Types as H
+--import Text.XML.HaXml.Pretty
+import qualified Text.XML.HaXml.Html.Parse as H
+import qualified Text.XHtml.Transitional as X
+import System.IO.Unsafe
+import Control.Exception
+
+-- |The HaXml HTML parser calls error if the parse fails, so we
+-- need to run it in the IO monad to catch that error.
+parse :: String -> X.Html
+parse text = unsafePerformIO (parseM text)
+
+-- The HaXml data type:
+-- data Document = Document Prolog (SymTab EntityDef) Element [Misc]
+-- data Element = Elem Name [Attribute] [Content]
+-- data Content = CElem Element | CString Bool CharData | CRef Reference | CMisc Misc
+
+parseM :: String -> IO X.Html
+parseM text =
+ do
+ -- The $! is required to force the parse error to occur before
+ -- we test for it. That's why they call it "unsafe IO" I guess.
+ result <- try . return $! H.htmlParse "text" (addHtml text)
+ case result of
+ Right (H.Document _ _ elem _) -> return . X.toHtml . stripHtml . H.CElem $ elem
+ Left _ -> return . X.stringToHtml $ text
+ where
+ addHtml :: String -> String
+ addHtml x = "" ++ x ++ ""
+ stripHtml :: H.Content -> H.Content
+ stripHtml (H.CElem (H.Elem "html" _ [content])) = stripHtml content
+ stripHtml x = x
+
+instance X.HTML H.Content where
+ toHtml = contentToHtml
+ where
+ contentToHtml :: H.Content -> X.Html
+ contentToHtml (H.CElem element) = elementToHtml element
+ contentToHtml (H.CString _bool chardata) = X.primHtml chardata
+ contentToHtml (H.CRef (H.RefEntity er)) = X.primHtmlChar er
+ contentToHtml (H.CRef (H.RefChar ch)) = X.primHtml ("#" ++ show ch)
+ contentToHtml (H.CMisc (H.Comment _comment)) = X.noHtml
+ contentToHtml (H.CMisc (H.PI (_pitarget, _string))) = X.noHtml
+ elementToHtml :: H.Element -> X.Html
+ elementToHtml (H.Elem name [] []) = X.itag name
+ elementToHtml (H.Elem name attrs []) = X.itag name X.! map attrsToHtml (attrs :: [(H.Name, H.AttValue)])
+ elementToHtml (H.Elem name [] content) = X.tag name (X.concatHtml (map contentToHtml content))
+ elementToHtml (H.Elem name attrs content) = X.tag name (X.concatHtml (map contentToHtml content)) X.! map attrsToHtml attrs
+ attrsToHtml :: (H.Name, H.AttValue) -> X.HtmlAttr
+ attrsToHtml (name, H.AttValue attValue) = X.strAttr name (concat (map attValueToString attValue))
+ attValueToString (Left string) = string
+ attValueToString (Right (H.RefEntity er)) = "&" ++ er ++ ";"
+ attValueToString (Right (H.RefChar ch)) = "" ++ show ch ++ ";"
+
+-- For debugging
+instance Show H.Document where
+ show (H.Document a b c d) = "(H.Document " ++ show a ++ " " ++ show b ++ " " ++ show c ++ " " ++ show d ++ ")"
+instance Show H.Prolog where
+ show _ = ""
+instance Show H.EntityDef where
+ show _ = ""
+instance Show H.Element where
+ show (H.Elem a b c) = "(H.Elem " ++ " " ++ show a ++ " " ++ show b ++ " " ++ show c ++ ")"
+instance Show H.AttValue where
+ show (H.AttValue x) = "(H.AttValue " ++ show x ++ ")"
+instance Show H.Content where
+ show (H.CElem x) = "(H.CElem " ++ show x ++ ")"
+ show (H.CString a b) = "(H.CString " ++ show a ++ " " ++ show b ++ ")"
+ show (H.CRef r) = "(H.CRef " ++ show r ++ ")"
+ show (H.CMisc x) = "(H.CMisc " ++ show x ++ ")"
+instance Show H.Reference where
+ show (H.RefEntity e) = "(H.RefEntity " ++ show e ++ ")"
+ show (H.RefChar r) = "(H.RefChar " ++ show r ++ ")"
+instance Show H.Misc where
+ show (H.Comment c) = "(H.Comment " ++ show c ++ ")"
+ show (H.PI p) = "(H.PI " ++ show p ++ ")"
hunk ./Ugly.cabal 17
- Ugly.HtmlForm, Ugly.Image, Ugly.Traverse, Ugly.URI,
+ Ugly.HtmlForm, Ugly.HtmlParse, Ugly.Traverse, Ugly.URI, Ugly.Image,
hunk ./debian/changelog 5
+ * Add the Ugly.HtmlParse module, which uses HaXml to parse HTML
+ and then converts it back to the Text.XHtml data type.
}