{-# LANGUAGE MultiParamTypeClasses, TypeSynonymInstances, DeriveDataTypeable #-}
module Encoding.Octets
    ( Octets(..)
    , pack
    , unpack
    , emptyOct
    ) where

import Data.Char
import Data.Generics
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import Text.Regex
import Encoding
import Encoding.Html

-- |The values of CGI variables are returned as a packed string of
-- Char8.  On top of this is an encoding of the unicode characters,
-- such as '&#39640;' and so on.  I don't know what the full
-- specification of this encoding is, presumably it is a function of
-- the different web servers and perhaps the browsers as well.
newtype Octets = Octets BL.ByteString deriving (Read, Show, Eq, Ord, Typeable, Data)

pack = Octets . BL.pack . map (fromInteger . toInteger . ord)
unpack (Octets s) = map (chr . fromInteger . toInteger) . BL.unpack $ s
emptyOct = Octets BL.empty

-- Octets is the encoding of CGI variable values.
instance Encoding Octets String where
    decode s = decodeString (unpack s)
        where decodeString ('\r':'\n' : t) = '\n' : decodeString t
              decodeString s = decodeCharRef s
              decodeCharRef s =
                  case matchRegexAll (mkRegexWithOpts "^&#([0-9]+);" False True) s of
                    Just ("", _, t, [n])
                      | read n < (0x110000 :: Integer) -> chr (read n) : decodeString t
                      | otherwise -> decodeEntityRef s
                    Nothing -> decodeEntityRef s
                    x -> error $ "Internal error in Octets.toUnicode " ++ show x
              decodeEntityRef s = 
                  case matchRegexAll (mkRegexWithOpts "^&(.*);" False True) s of
                    Just ("", _, t, [w]) ->
                        case entityOfName w of
                                Just c -> c : decodeString t
                                Nothing -> decodeOther s
                    Nothing -> decodeOther s
                    _ -> error $ "Internal error in Octets.toUnicode (2)"
              decodeOther (h : t) = h : decodeString t
              decodeOther [] = []
    encode unicode = 
        pack (encodeString unicode)
        where encodeString [] = ""
              encodeString (x : xs) =
                  (case entityToName x of
                     Just w -> "&" ++ w ++ ";"
                     Nothing -> 
                         -- See Text.XHtml.Internals, 0xff is encoded to "&#255;".
                         if ord x >= 0xff
                         then "&#" ++ show (ord x) ++ ";"
                         else if x == '\n'
                              then "\r\n"
                              else [x]) ++ encodeString xs

instance Monoid Octets where
    mempty = Octets (BL.empty)
    mappend (Octets s1) (Octets s2) = Octets (BL.append s1 s2)
    mconcat xs = Octets (BL.concat (map (\ (Octets s) -> s) xs))
