{-# LANGUAGE CPP #-}
--------------------------------------------------------------------------------
-- |
-- Module : Network.URI
-- Copyright : (c) 2004, Graham Klyne
-- License : BSD-style (see end of this file)
--
-- Maintainer : Graham Klyne <gk@ninebynine.org>
-- Stability : provisional
-- Portability : portable
--
-- This module defines functions for handling URIs. It presents substantially the
-- same interface as the older GHC Network.URI module, but is implemented using
-- Parsec rather than a Regex library that is not available with Hugs. The internal
-- representation of URI has been changed so that URI strings are more
-- completely preserved when round-tripping to a URI value and back.
--
-- In addition, four methods are provided for parsing different
-- kinds of URI string (as noted in RFC3986):
-- 'parseURI',
-- 'parseURIReference',
-- 'parseRelativeReference' and
-- 'parseAbsoluteURI'.
--
-- Further, four methods are provided for classifying different
-- kinds of URI string (as noted in RFC3986):
-- 'isURI',
-- 'isURIReference',
-- 'isRelativeReference' and
-- 'isAbsoluteURI'.
--
-- The long-standing official reference for URI handling was RFC2396 [1],
-- as updated by RFC 2732 [2], but this was replaced by a new specification,
-- RFC3986 [3] in January 2005. This latter specification has been used
-- as the primary reference for constructing the URI parser implemented
-- here, and it is intended that there is a direct relationship between
-- the syntax definition in that document and this parser implementation.
--
-- RFC 1808 [4] contains a number of test cases for relative URI handling.
-- Dan Connolly's Python module @uripath.py@ [5] also contains useful details
-- and test cases.
--
-- Some of the code has been copied from the previous GHC implementation,
-- but the parser is replaced with one that performs more complete
-- syntax checking of the URI itself, according to RFC3986 [3].
--
-- References
--
-- (1) <http://www.ietf.org/rfc/rfc2396.txt>
--
-- (2) <http://www.ietf.org/rfc/rfc2732.txt>
--
-- (3) <http://www.ietf.org/rfc/rfc3986.txt>
--
-- (4) <http://www.ietf.org/rfc/rfc1808.txt>
--
-- (5) <http://www.w3.org/2000/10/swap/uripath.py>
--
--------------------------------------------------------------------------------
module Network.URI
(
-- * The URI type
URI(..)
, URIAuth(..)
, nullURI
-- * Parsing
, parseURI
, parseURIReference
, parseRelativeReference
, parseAbsoluteURI
-- * Test for strings containing various kinds of URI
, isURI
, isURIReference
, isRelativeReference
, isAbsoluteURI
, isIPv6address
, isIPv4address
-- * Predicates
, uriIsAbsolute
, uriIsRelative
-- * Relative URIs
, relativeTo
, nonStrictRelativeTo
, relativeFrom
-- * Operations on URI strings
-- | Support for putting strings into URI-friendly
-- escaped format and getting them back again.
-- This can't be done transparently in all cases, because certain
-- characters have different meanings in different kinds of URI.
-- The URI spec [3], section 2.4, indicates that all URI components
-- should be escaped before they are assembled as a URI:
-- \"Once produced, a URI is always in its percent-encoded form\"
, uriToString
, isReserved, isUnreserved
, isAllowedInURI, isUnescapedInURI
, isUnescapedInURIComponent
, escapeURIChar
, escapeURIString
, unEscapeString
, pathSegments
-- * URI Normalization functions
, normalizeCase
, normalizeEscape
, normalizePathSegments
-- * Deprecated functions
, parseabsoluteURI
, escapeString
, reserved, unreserved
, scheme, authority, path, query, fragment
) where
import Text.ParserCombinators.Parsec
( GenParser, ParseError
, parse, (<?>), try
, option, many1, count, notFollowedBy
, char, satisfy, oneOf, string, eof
, unexpected
)
import Control.Applicative
import Control.Monad (MonadPlus(..))
import Control.DeepSeq (NFData(rnf), deepseq)
import Data.Char (ord, chr, isHexDigit, toLower, toUpper, digitToInt)
import Data.Bits ((.|.),(.&.),shiftL,shiftR)
import Data.List (unfoldr)
import Numeric (showIntAtBase)
#if !MIN_VERSION_base(4,8,0)
import Data.Traversable (sequenceA)
#endif
import Data.Typeable (Typeable)
#if MIN_VERSION_base(4,0,0)
import Data.Data (Data)
#else
import Data.Generics (Data)
#endif
#if MIN_VERSION_base(4,6,0)
import GHC.Generics (Generic)
#else
#endif
------------------------------------------------------------
-- The URI datatype
------------------------------------------------------------
-- |Represents a general universal resource identifier using
-- its component parts.
--
-- For example, for the URI
--
-- > foo://anonymous@www.haskell.org:42/ghc?query#frag
--
-- the components are:
--
data URI = URI
{ uriScheme :: String -- ^ @foo:@
, uriAuthority :: Maybe URIAuth -- ^ @\/\/anonymous\@www.haskell.org:42@
, uriPath :: String -- ^ @\/ghc@
, uriQuery :: String -- ^ @?query@
, uriFragment :: String -- ^ @#frag@
#if MIN_VERSION_base(4,6,0)
} deriving (Eq, Ord, Typeable, Data, Generic)
#else
} deriving (Eq, Ord, Typeable, Data)
#endif
instance NFData URI where
rnf (URI s a p q f)
= s `deepseq` a `deepseq` p `deepseq` q `deepseq` f `deepseq` ()
-- |Type for authority value within a URI
data URIAuth = URIAuth
{ uriUserInfo :: String -- ^ @anonymous\@@
, uriRegName :: String -- ^ @www.haskell.org@
, uriPort :: String -- ^ @:42@
} deriving (Eq, Ord, Show, Typeable, Data)
instance NFData URIAuth where
rnf (URIAuth ui rn p) = ui `deepseq` rn `deepseq` p `deepseq` ()
-- |Blank URI
nullURI :: URI
nullURI = URI
{ uriScheme = ""
, uriAuthority = Nothing
, uriPath = ""
, uriQuery = ""
, uriFragment = ""
}
-- URI as instance of Show. Note that for security reasons, the default
-- behaviour is to suppress any userinfo field (see RFC3986, section 7.5).
-- This can be overridden by using uriToString directly with first
-- argument @id@ (noting that this returns a ShowS value rather than a string).
--
-- [[[Another design would be to embed the userinfo mapping function in
-- the URIAuth value, with the default value suppressing userinfo formatting,
-- but providing a function to return a new URI value with userinfo
-- data exposed by show.]]]
--
instance Show URI where
showsPrec _ = uriToString defaultUserInfoMap
defaultUserInfoMap :: String -> String
defaultUserInfoMap uinf = user++newpass
where
(user,pass) = break (==':') uinf
newpass = if null pass || (pass == "@")
|| (pass == ":@")
then pass
else ":...@"
------------------------------------------------------------
-- Parse a URI
------------------------------------------------------------
-- |Turn a string containing a URI into a 'URI'.
-- Returns 'Nothing' if the string is not a valid URI;
-- (an absolute URI with optional fragment identifier).
--
-- NOTE: this is different from the previous network.URI,
-- whose @parseURI@ function works like 'parseURIReference'
-- in this module.
--
parseURI :: String -> Maybe URI
parseURI = parseURIAny uri
-- |Parse a URI reference to a 'URI' value.
-- Returns 'Nothing' if the string is not a valid URI reference.
-- (an absolute or relative URI with optional fragment identifier).
--
parseURIReference :: String -> Maybe URI
parseURIReference = parseURIAny uriReference
-- |Parse a relative URI to a 'URI' value.
-- Returns 'Nothing' if the string is not a valid relative URI.
-- (a relative URI with optional fragment identifier).
--
parseRelativeReference :: String -> Maybe URI
parseRelativeReference = parseURIAny relativeRef
-- |Parse an absolute URI to a 'URI' value.
-- Returns 'Nothing' if the string is not a valid absolute URI.
-- (an absolute URI without a fragment identifier).
--
parseAbsoluteURI :: String -> Maybe URI
parseAbsoluteURI = parseURIAny absoluteURI
-- |Test if string contains a valid URI
-- (an absolute URI with optional fragment identifier).
--
isURI :: String -> Bool
isURI = isValidParse uri
-- |Test if string contains a valid URI reference
-- (an absolute or relative URI with optional fragment identifier).
--
isURIReference :: String -> Bool
isURIReference = isValidParse uriReference
-- |Test if string contains a valid relative URI
-- (a relative URI with optional fragment identifier).
--
isRelativeReference :: String -> Bool
isRelativeReference = isValidParse relativeRef
-- |Test if string contains a valid absolute URI
-- (an absolute URI without a fragment identifier).
--
isAbsoluteURI :: String -> Bool
isAbsoluteURI = isValidParse absoluteURI
-- |Test if string contains a valid IPv6 address
--
isIPv6address :: String -> Bool
isIPv6address = isValidParse ipv6address
-- |Test if string contains a valid IPv4 address
--
isIPv4address :: String -> Bool
isIPv4address = isValidParse ipv4address
-- Helper function for turning a string into a URI
--
parseURIAny :: URIParser URI -> String -> Maybe URI
parseURIAny parser uristr = case parseAll parser "" uristr of
Left _ -> Nothing
Right u -> Just u
-- Helper function to test a string match to a parser
--
isValidParse :: URIParser a -> String -> Bool
isValidParse parser uristr = case parseAll parser "" uristr of
-- Left e -> error (show e)
Left _ -> False
Right _ -> True
parseAll :: URIParser a -> String -> String -> Either ParseError a
parseAll parser filename uristr = parse newparser filename uristr
where
newparser =
do { res <- parser
; eof
; return res
}
------------------------------------------------------------
-- Predicates
------------------------------------------------------------
uriIsAbsolute :: URI -> Bool
uriIsAbsolute (URI {uriScheme = scheme'}) = scheme' /= ""
uriIsRelative :: URI -> Bool
uriIsRelative = not . uriIsAbsolute
------------------------------------------------------------
-- URI parser body based on Parsec elements and combinators
------------------------------------------------------------
-- Parser parser type.
-- Currently
type URIParser a = GenParser Char () a
-- RFC3986, section 2.1
--
-- Parse and return a 'pct-encoded' sequence
--
escaped :: URIParser String
escaped = sequenceA [char '%', hexDigitChar, hexDigitChar]
-- RFC3986, section 2.2
--
-- |Returns 'True' if the character is a \"reserved\" character in a
-- URI. To include a literal instance of one of these characters in a
-- component of a URI, it must be escaped.
--
isReserved :: Char -> Bool
isReserved c = isGenDelims c || isSubDelims c
isGenDelims :: Char -> Bool
isGenDelims c = c `elem` ":/?#[]@"
isSubDelims :: Char -> Bool
isSubDelims c = c `elem` "!$&'()*+,;="
subDelims :: URIParser String
subDelims = (:[]) <$> oneOf "!$&'()*+,;="
-- RFC3986, section 2.3
--
-- |Returns 'True' if the character is an \"unreserved\" character in
-- a URI. These characters do not need to be escaped in a URI. The
-- only characters allowed in a URI are either \"reserved\",
-- \"unreserved\", or an escape sequence (@%@ followed by two hex digits).
--
isUnreserved :: Char -> Bool
isUnreserved c = isAlphaNumChar c || (c `elem` "-_.~")
unreservedChar :: URIParser String
unreservedChar = (:[]) <$> satisfy isUnreserved
-- RFC3986, section 3
--
-- URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
--
-- hier-part = "//" authority path-abempty
-- / path-abs
-- / path-rootless
-- / path-empty
uri :: URIParser URI
uri =
do { us <- try uscheme
-- ; ua <- option Nothing ( do { try (string "//") ; uauthority } )
-- ; up <- upath
; (ua,up) <- hierPart
; uq <- option "" ( do { _ <- char '?' ; uquery } )
; uf <- option "" ( do { _ <- char '#' ; ufragment } )
; return $ URI
{ uriScheme = us
, uriAuthority = ua
, uriPath = up
, uriQuery = uq
, uriFragment = uf
}
}
hierPart :: URIParser ((Maybe URIAuth),String)
hierPart =
do { _ <- try (string "//")
; ua <- uauthority
; up <- pathAbEmpty
; return (ua,up)
}
<|> do { up <- pathAbs
; return (Nothing,up)
}
<|> do { up <- pathRootLess
; return (Nothing,up)
}
<|> do { return (Nothing,"")
}
-- RFC3986, section 3.1
uscheme :: URIParser String
uscheme =
do { s <- oneThenMany alphaChar (satisfy isSchemeChar)
; _ <- char ':'
; return $ s++":"
}
-- RFC3986, section 3.2
uauthority :: URIParser (Maybe URIAuth)
uauthority =
do { uu <- option "" (try userinfo)
; uh <- host
; up <- option "" port
; return $ Just $ URIAuth
{ uriUserInfo = uu
, uriRegName = uh
, uriPort = up
}
}
-- RFC3986, section 3.2.1
userinfo :: URIParser String
userinfo =
do { uu <- many (uchar ";:&=+$,")
; _ <- char '@'
; return (concat uu ++"@")
}
-- RFC3986, section 3.2.2
host :: URIParser String
host = ipLiteral <|> try ipv4address <|> regName
ipLiteral :: URIParser String
ipLiteral =
do { _ <- char '['
; ua <- ( ipv6address <|> ipvFuture )
; _ <- char ']'
; return $ "[" ++ ua ++ "]"
}
<?> "IP address literal"
ipvFuture :: URIParser String
ipvFuture =
do { _ <- char 'v'
; h <- hexDigitChar
; _ <- char '.'
; a <- many1 (satisfy isIpvFutureChar)
; return $ 'v':h:'.':a
}
isIpvFutureChar :: Char -> Bool
isIpvFutureChar c = isUnreserved c || isSubDelims c || (c==';')
ipv6address :: URIParser String
ipv6address =
try ( do
{ a2 <- count 6 h4c
; a3 <- ls32
; return $ concat a2 ++ a3
} )
<|> try ( do
{ _ <- string "::"
; a2 <- count 5 h4c
; a3 <- ls32
; return $ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 0
; _ <- string "::"
; a2 <- count 4 h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 1
; _ <- string "::"
; a2 <- count 3 h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 2
; _ <- string "::"
; a2 <- count 2 h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 3
; _ <- string "::"
; a2 <- h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 4
; _ <- string "::"
; a3 <- ls32
; return $ a1 ++ "::" ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 5
; _ <- string "::"
; a3 <- h4
; return $ a1 ++ "::" ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 6
; _ <- string "::"
; return $ a1 ++ "::"
} )
<?> "IPv6 address"
opt_n_h4c_h4 :: Int -> URIParser String
opt_n_h4c_h4 n = option "" $
do { a1 <- countMinMax 0 n h4c
; a2 <- h4
; return $ concat a1 ++ a2
}
ls32 :: URIParser String
ls32 = try ( do
{ a1 <- h4c
; a2 <- h4
; return (a1++a2)
} )
<|> ipv4address
h4c :: URIParser String
h4c = try $
do { a1 <- h4
; _ <- char ':'
; _ <- notFollowedBy (char ':')
; return $ a1 ++ ":"
}
h4 :: URIParser String
h4 = countMinMax 1 4 hexDigitChar
ipv4address :: URIParser String
ipv4address =
do { a1 <- decOctet ; _ <- char '.'
; a2 <- decOctet ; _ <- char '.'
; a3 <- decOctet ; _ <- char '.'
; a4 <- decOctet
; _ <- notFollowedBy nameChar
; return $ a1++"."++a2++"."++a3++"."++a4
}
<?> "IPv4 Address"
decOctet :: URIParser String
decOctet =
do { a1 <- countMinMax 1 3 digitChar
; if (read a1 :: Integer) > 255 then
fail "Decimal octet value too large"
else
return a1
}
regName :: URIParser String
regName =
do { ss <- countMinMax 0 255 nameChar
; return $ concat ss
}
<?> "Registered name"
nameChar :: URIParser String
nameChar = (unreservedChar <|> escaped <|> subDelims)
<?> "Name character"
-- RFC3986, section 3.2.3
port :: URIParser String
port =
do { _ <- char ':'
; p <- many digitChar
; return (':':p)
}
--
-- RFC3986, section 3.3
--
-- path = path-abempty ; begins with "/" or is empty
-- / path-abs ; begins with "/" but not "//"
-- / path-noscheme ; begins with a non-colon segment
-- / path-rootless ; begins with a segment
-- / path-empty ; zero characters
--
-- path-abempty = *( "/" segment )
-- path-abs = "/" [ segment-nz *( "/" segment ) ]
-- path-noscheme = segment-nzc *( "/" segment )
-- path-rootless = segment-nz *( "/" segment )
-- path-empty = 0<pchar>
--
-- segment = *pchar
-- segment-nz = 1*pchar
-- segment-nzc = 1*( unreserved / pct-encoded / sub-delims / "@" )
--
-- pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
{-
upath :: URIParser String
upath = pathAbEmpty
<|> pathAbs
<|> pathNoScheme
<|> pathRootLess
<|> pathEmpty
-}
pathAbEmpty :: URIParser String
pathAbEmpty =
do { ss <- many slashSegment
; return $ concat ss
}
pathAbs :: URIParser String
pathAbs =
do { _ <- char '/'
; ss <- option "" pathRootLess
; return $ '/':ss
}
pathNoScheme :: URIParser String
pathNoScheme =
do { s1 <- segmentNzc
; ss <- many slashSegment
; return $ concat (s1:ss)
}
pathRootLess :: URIParser String
pathRootLess =
do { s1 <- segmentNz
; ss <- many slashSegment
; return $ concat (s1:ss)
}
slashSegment :: URIParser String
slashSegment =
do { _ <- char '/'
; s <- segment
; return ('/':s)
}
segment :: URIParser String
segment =
do { ps <- many pchar
; return $ concat ps
}
segmentNz :: URIParser String
segmentNz =
do { ps <- many1 pchar
; return $ concat ps
}
segmentNzc :: URIParser String
segmentNzc =
do { ps <- many1 (uchar "@")
; return $ concat ps
}
pchar :: URIParser String
pchar = uchar ":@"
-- helper function for pchar and friends
uchar :: String -> URIParser String
uchar extras =
unreservedChar
<|> escaped
<|> subDelims
<|> do { c <- oneOf extras ; return [c] }
-- RFC3986, section 3.4
uquery :: URIParser String
uquery =
do { ss <- many $ uchar (":@"++"/?")
; return $ '?':concat ss
}
-- RFC3986, section 3.5
ufragment :: URIParser String
ufragment =
do { ss <- many $ uchar (":@"++"/?")
; return $ '#':concat ss
}
-- Reference, Relative and Absolute URI forms
--
-- RFC3986, section 4.1
uriReference :: URIParser URI
uriReference = uri <|> relativeRef
-- RFC3986, section 4.2
--
-- relative-URI = relative-part [ "?" query ] [ "#" fragment ]
--
-- relative-part = "//" authority path-abempty
-- / path-abs
-- / path-noscheme
-- / path-empty
relativeRef :: URIParser URI
relativeRef =
do { notMatching uscheme
-- ; ua <- option Nothing ( do { try (string "//") ; uauthority } )
-- ; up <- upath
; (ua,up) <- relativePart
; uq <- option "" ( do { _ <- char '?' ; uquery } )
; uf <- option "" ( do { _ <- char '#' ; ufragment } )
; return $ URI
{ uriScheme = ""
, uriAuthority = ua
, uriPath = up
, uriQuery = uq
, uriFragment = uf
}
}
relativePart :: URIParser ((Maybe URIAuth),String)
relativePart =
do { _ <- try (string "//")
; ua <- uauthority
; up <- pathAbEmpty
; return (ua,up)
}
<|> do { up <- pathAbs
; return (Nothing,up)
}
<|> do { up <- pathNoScheme
; return (Nothing,up)
}
<|> do { return (Nothing,"")
}
-- RFC3986, section 4.3
absoluteURI :: URIParser URI
absoluteURI =
do { us <- uscheme
-- ; ua <- option Nothing ( do { try (string "//") ; uauthority } )
-- ; up <- upath
; (ua,up) <- hierPart
; uq <- option "" ( do { _ <- char '?' ; uquery } )
; return $ URI
{ uriScheme = us
, uriAuthority = ua
, uriPath = up
, uriQuery = uq
, uriFragment = ""
}
}
-- Imports from RFC 2234
-- NOTE: can't use isAlphaNum etc. because these deal with ISO 8859
-- (and possibly Unicode!) chars.
-- [[[Above was a comment originally in GHC Network/URI.hs:
-- when IRIs are introduced then most codepoints above 128(?) should
-- be treated as unreserved, and higher codepoints for letters should
-- certainly be allowed.
-- ]]]
isAlphaChar :: Char -> Bool
isAlphaChar c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
isDigitChar :: Char -> Bool
isDigitChar c = (c >= '0' && c <= '9')
isAlphaNumChar :: Char -> Bool
isAlphaNumChar c = isAlphaChar c || isDigitChar c
isHexDigitChar :: Char -> Bool
isHexDigitChar c = isHexDigit c
isSchemeChar :: Char -> Bool
isSchemeChar c = (isAlphaNumChar c) || (c `elem` "+-.")
alphaChar :: URIParser Char
alphaChar = satisfy isAlphaChar -- or: Parsec.letter ?
digitChar :: URIParser Char
digitChar = satisfy isDigitChar -- or: Parsec.digit ?
hexDigitChar :: URIParser Char
hexDigitChar = satisfy isHexDigitChar -- or: Parsec.hexDigit ?
-- Additional parser combinators for common patterns
oneThenMany :: GenParser t s a -> GenParser t s a -> GenParser t s [a]
oneThenMany p1 pr =
do { a1 <- p1
; ar <- many pr
; return (a1:ar)
}
countMinMax :: Int -> Int -> GenParser t s a -> GenParser t s [a]
countMinMax m n p | m > 0 =
do { a1 <- p
; ar <- countMinMax (m-1) (n-1) p
; return (a1:ar)
}
countMinMax _ n _ | n <= 0 = return []
countMinMax _ n p = option [] $
do { a1 <- p
; ar <- countMinMax 0 (n-1) p
; return (a1:ar)
}
notMatching :: Show a => GenParser tok st a -> GenParser tok st ()
notMatching p = do { a <- try p ; unexpected (show a) } <|> return ()
------------------------------------------------------------
-- Reconstruct a URI string
------------------------------------------------------------
--
-- |Turn a 'URI' into a string.
--
-- Uses a supplied function to map the userinfo part of the URI.
--
-- The Show instance for URI uses a mapping that hides any password
-- that may be present in the URI. Use this function with argument @id@
-- to preserve the password in the formatted output.
--
uriToString :: (String->String) -> URI -> ShowS
uriToString userinfomap URI { uriScheme=myscheme
, uriAuthority=myauthority
, uriPath=mypath
, uriQuery=myquery
, uriFragment=myfragment
} =
(myscheme++) . (uriAuthToString userinfomap myauthority)
. (mypath++) . (myquery++) . (myfragment++)
uriAuthToString :: (String->String) -> (Maybe URIAuth) -> ShowS
uriAuthToString _ Nothing = id -- shows ""
uriAuthToString userinfomap
(Just URIAuth { uriUserInfo = myuinfo
, uriRegName = myregname
, uriPort = myport
} ) =
("//"++) . (if null myuinfo then id else ((userinfomap myuinfo)++))
. (myregname++)
. (myport++)
------------------------------------------------------------
-- Character classes
------------------------------------------------------------
-- | Returns 'True' if the character is allowed in a URI.
--
isAllowedInURI :: Char -> Bool
isAllowedInURI c = isReserved c || isUnreserved c || c == '%' -- escape char
-- | Returns 'True' if the character is allowed unescaped in a URI.
--
-- >>> escapeURIString isUnescapedInURI "http://haskell.org:80?some_param=true&other_param=їґ"
-- "http://haskell.org:80?some_param=true&other_param=%D1%97%D2%91"
isUnescapedInURI :: Char -> Bool
isUnescapedInURI c = isReserved c || isUnreserved c
-- | Returns 'True' if the character is allowed unescaped in a URI component.
--
-- >>> escapeURIString isUnescapedInURIComponent "http://haskell.org:80?some_param=true&other_param=їґ"
-- "http%3A%2F%2Fhaskell.org%3A80%3Fsome_param%3Dtrue%26other_param%3D%D1%97%D2%91"
isUnescapedInURIComponent :: Char -> Bool
isUnescapedInURIComponent c = not (isReserved c || not (isUnescapedInURI c))
------------------------------------------------------------
-- Escape sequence handling
------------------------------------------------------------
-- |Escape character if supplied predicate is not satisfied,
-- otherwise return character as singleton string.
--
escapeURIChar :: (Char->Bool) -> Char -> String
escapeURIChar p c
| p c = [c]
| otherwise = concatMap (\i -> '%' : myShowHex i "") (utf8EncodeChar c)
where
myShowHex :: Int -> ShowS
myShowHex n r = case showIntAtBase 16 (toChrHex) n r of
[] -> "00"
[x] -> ['0',x]
cs -> cs
toChrHex d
| d < 10 = chr (ord '0' + fromIntegral d)
| otherwise = chr (ord 'A' + fromIntegral (d - 10))
-- From http://hackage.haskell.org/package/utf8-string
-- by Eric Mertens, BSD3
-- Returns [Int] for use with showIntAtBase
utf8EncodeChar :: Char -> [Int]
utf8EncodeChar = map fromIntegral . go . ord
where
go oc
| oc <= 0x7f = [oc]
| oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6)
, 0x80 + oc .&. 0x3f
]
| oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
, 0x80 + oc .&. 0x3f
]
| otherwise = [ 0xf0 + (oc `shiftR` 18)
, 0x80 + ((oc `shiftR` 12) .&. 0x3f)
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
, 0x80 + oc .&. 0x3f
]
-- |Can be used to make a string valid for use in a URI.
--
escapeURIString
:: (Char->Bool) -- ^ a predicate which returns 'False'
-- if the character should be escaped
-> String -- ^ the string to process
-> String -- ^ the resulting URI string
escapeURIString p s = concatMap (escapeURIChar p) s
-- |Turns all instances of escaped characters in the string back
-- into literal characters.
--
unEscapeString :: String -> String
unEscapeString [] = ""
unEscapeString s@(c:cs) = case unEscapeByte s of
Just (byte, rest) -> unEscapeUtf8 byte rest
Nothing -> c : unEscapeString cs
unEscapeByte :: String -> Maybe (Int, String)
unEscapeByte ('%':x1:x2:s) | isHexDigit x1 && isHexDigit x2 =
Just (digitToInt x1 * 16 + digitToInt x2, s)
unEscapeByte _ = Nothing
-- Adapted from http://hackage.haskell.org/package/utf8-string
-- by Eric Mertens, BSD3
unEscapeUtf8 :: Int -> String -> String
unEscapeUtf8 c rest
| c < 0x80 = chr c : unEscapeString rest
| c < 0xc0 = replacement_character : unEscapeString rest
| c < 0xe0 = multi1
| c < 0xf0 = multi_byte 2 0xf 0x800
| c < 0xf8 = multi_byte 3 0x7 0x10000
| c < 0xfc = multi_byte 4 0x3 0x200000
| c < 0xfe = multi_byte 5 0x1 0x4000000
| otherwise = replacement_character : unEscapeString rest
where
replacement_character = '\xfffd'
multi1 = case unEscapeByte rest of
Just (c1, ds) | c1 .&. 0xc0 == 0x80 ->
let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
in if d >= 0x000080 then toEnum d : unEscapeString ds
else replacement_character : unEscapeString ds
_ -> replacement_character : unEscapeString rest
multi_byte :: Int -> Int -> Int -> String
multi_byte i mask overlong =
aux i rest (unEscapeByte rest) (c .&. mask)
where
aux 0 rs _ acc
| overlong <= acc && acc <= 0x10ffff &&
(acc < 0xd800 || 0xdfff < acc) &&
(acc < 0xfffe || 0xffff < acc) = chr acc : unEscapeString rs
| otherwise = replacement_character : unEscapeString rs
aux n _ (Just (r, rs)) acc
| r .&. 0xc0 == 0x80 = aux (n-1) rs (unEscapeByte rs)
$! shiftL acc 6 .|. (r .&. 0x3f)
aux _ rs _ _ = replacement_character : unEscapeString rs
------------------------------------------------------------
-- Resolving a relative URI relative to a base URI
------------------------------------------------------------
-- |Returns a new 'URI' which represents the value of the
-- first 'URI' interpreted as relative to the second 'URI'.
-- For example:
--
-- > "foo" `relativeTo` "http://bar.org/" = "http://bar.org/foo"
-- > "http:foo" `nonStrictRelativeTo` "http://bar.org/" = "http://bar.org/foo"
--
-- Algorithm from RFC3986 [3], section 5.2.2
--
nonStrictRelativeTo :: URI -> URI -> URI
nonStrictRelativeTo ref base = relativeTo ref' base
where
ref' = if uriScheme ref == uriScheme base
then ref { uriScheme="" }
else ref
isDefined :: ( MonadPlus m, Eq (m a) ) => m a -> Bool
isDefined a = a /= mzero
-- | Returns a new 'URI' which represents the value of the first 'URI'
-- interpreted as relative to the second 'URI'.
--
-- Algorithm from RFC3986 [3], section 5.2
relativeTo :: URI -> URI -> URI
relativeTo ref base
| isDefined ( uriScheme ref ) =
just_segments ref
| isDefined ( uriAuthority ref ) =
just_segments ref { uriScheme = uriScheme base }
| isDefined ( uriPath ref ) =
if (head (uriPath ref) == '/') then
just_segments ref
{ uriScheme = uriScheme base
, uriAuthority = uriAuthority base
}
else
just_segments ref
{ uriScheme = uriScheme base
, uriAuthority = uriAuthority base
, uriPath = mergePaths base ref
}
| isDefined ( uriQuery ref ) =
just_segments ref
{ uriScheme = uriScheme base
, uriAuthority = uriAuthority base
, uriPath = uriPath base
}
| otherwise =
just_segments ref
{ uriScheme = uriScheme base
, uriAuthority = uriAuthority base
, uriPath = uriPath base
, uriQuery = uriQuery base
}
where
just_segments u =
u { uriPath = removeDotSegments (uriPath u) }
mergePaths b r
| isDefined (uriAuthority b) && null pb = '/':pr
| otherwise = dropLast pb ++ pr
where
pb = uriPath b
pr = uriPath r
dropLast = fst . splitLast -- reverse . dropWhile (/='/') . reverse
-- Remove dot segments, but protect leading '/' character
removeDotSegments :: String -> String
removeDotSegments ('/':ps) = '/':elimDots ps []
removeDotSegments ps = elimDots ps []
-- Second arg accumulates segments processed so far in reverse order
elimDots :: String -> [String] -> String
-- elimDots ps rs | traceVal "\nps " ps $ traceVal "rs " rs $ False = error ""
elimDots [] [] = ""
elimDots [] rs = concat (reverse rs)
elimDots ( '.':'/':ps) rs = elimDots ps rs
elimDots ( '.':[] ) rs = elimDots [] rs
elimDots ( '.':'.':'/':ps) rs = elimDots ps (drop 1 rs)
elimDots ( '.':'.':[] ) rs = elimDots [] (drop 1 rs)
elimDots ps rs = elimDots ps1 (r:rs)
where
(r,ps1) = nextSegment ps
-- Returns the next segment and the rest of the path from a path string.
-- Each segment ends with the next '/' or the end of string.
--
nextSegment :: String -> (String,String)
nextSegment ps =
case break (=='/') ps of
(r,'/':ps1) -> (r++"/",ps1)
(r,_) -> (r,[])
segments :: String -> [String]
segments str = dropLeadingEmpty $ unfoldr nextSegmentMaybe str
where
nextSegmentMaybe "" = Nothing
nextSegmentMaybe ps =
case break (=='/') ps of
(seg, '/':ps1) -> Just (seg, ps1)
(seg, _) -> Just (seg, "")
dropLeadingEmpty ("":xs) = xs
dropLeadingEmpty xs = xs
-- | Returns the segments of the path component. E.g.,
-- pathSegments <$> parseURI "http://example.org/foo/bar/baz"
-- == ["foo", "bar", "baz"]
pathSegments :: URI -> [String]
pathSegments = segments . uriPath
-- | Split last (name) segment from path, returning (path,name)
splitLast :: String -> (String,String)
splitLast p = (reverse revpath,reverse revname)
where
(revname,revpath) = break (=='/') $ reverse p
------------------------------------------------------------
-- Finding a URI relative to a base URI
------------------------------------------------------------
-- |Returns a new 'URI' which represents the relative location of
-- the first 'URI' with respect to the second 'URI'. Thus, the
-- values supplied are expected to be absolute URIs, and the result
-- returned may be a relative URI.
--
-- Example:
--
-- > "http://example.com/Root/sub1/name2#frag"
-- > `relativeFrom` "http://example.com/Root/sub2/name2#frag"
-- > == "../sub1/name2#frag"
--
-- There is no single correct implementation of this function,
-- but any acceptable implementation must satisfy the following:
--
-- > (uabs `relativeFrom` ubase) `relativeTo` ubase == uabs
--
-- For any valid absolute URI.
-- (cf. <http://lists.w3.org/Archives/Public/uri/2003Jan/0008.html>
-- <http://lists.w3.org/Archives/Public/uri/2003Jan/0005.html>)
--
relativeFrom :: URI -> URI -> URI
relativeFrom uabs base
| diff uriScheme uabs base = uabs
| diff uriAuthority uabs base = uabs { uriScheme = "" }
| diff uriPath uabs base = uabs
{ uriScheme = ""
, uriAuthority = Nothing
, uriPath = relPathFrom (removeBodyDotSegments $ uriPath uabs)
(removeBodyDotSegments $ uriPath base)
}
| diff uriQuery uabs base = uabs
{ uriScheme = ""
, uriAuthority = Nothing
, uriPath = ""
}
| otherwise = uabs -- Always carry fragment from uabs
{ uriScheme = ""
, uriAuthority = Nothing
, uriPath = ""
, uriQuery = ""
}
where
diff :: Eq b => (a -> b) -> a -> a -> Bool
diff sel u1 u2 = sel u1 /= sel u2
-- Remove dot segments except the final segment
removeBodyDotSegments p = removeDotSegments p1 ++ p2
where
(p1,p2) = splitLast p
relPathFrom :: String -> String -> String
relPathFrom [] _ = "/"
relPathFrom pabs [] = pabs
relPathFrom pabs base = -- Construct a relative path segments
if sa1 == sb1 -- if the paths share a leading segment
then if (sa1 == "/") -- other than a leading '/'
then if (sa2 == sb2)
then relPathFrom1 ra2 rb2
else pabs
else relPathFrom1 ra1 rb1
else pabs
where
(sa1,ra1) = nextSegment pabs
(sb1,rb1) = nextSegment base
(sa2,ra2) = nextSegment ra1
(sb2,rb2) = nextSegment rb1
-- relPathFrom1 strips off trailing names from the supplied paths,
-- and calls difPathFrom to find the relative path from base to
-- target
relPathFrom1 :: String -> String -> String
relPathFrom1 pabs base = relName
where
(sa,na) = splitLast pabs
(sb,nb) = splitLast base
rp = relSegsFrom sa sb
relName = if null rp then
if (na == nb) then ""
else if protect na then "./"++na
else na
else
rp++na
-- Precede name with some path if it is null or contains a ':'
protect s = null s || ':' `elem` s
-- relSegsFrom discards any common leading segments from both paths,
-- then invokes difSegsFrom to calculate a relative path from the end
-- of the base path to the end of the target path.
-- The final name is handled separately, so this deals only with
-- "directory" segtments.
--
relSegsFrom :: String -> String -> String
{-
relSegsFrom sabs base
| traceVal "\nrelSegsFrom\nsabs " sabs $ traceVal "base " base $
False = error ""
-}
relSegsFrom [] [] = "" -- paths are identical
relSegsFrom sabs base =
if sa1 == sb1
then relSegsFrom ra1 rb1
else difSegsFrom sabs base
where
(sa1,ra1) = nextSegment sabs
(sb1,rb1) = nextSegment base
-- difSegsFrom calculates a path difference from base to target,
-- not including the final name at the end of the path
-- (i.e. results always ends with '/')
--
-- This function operates under the invariant that the supplied
-- value of sabs is the desired path relative to the beginning of
-- base. Thus, when base is empty, the desired path has been found.
--
difSegsFrom :: String -> String -> String
{-
difSegsFrom sabs base
| traceVal "\ndifSegsFrom\nsabs " sabs $ traceVal "base " base $
False = error ""
-}
difSegsFrom sabs "" = sabs
difSegsFrom sabs base = difSegsFrom ("../"++sabs) (snd $ nextSegment base)
------------------------------------------------------------
-- Other normalization functions
------------------------------------------------------------
-- |Case normalization; cf. RFC3986 section 6.2.2.1
-- NOTE: authority case normalization is not performed
--
normalizeCase :: String -> String
normalizeCase uristr = ncScheme uristr
where
ncScheme (':':cs) = ':':ncEscape cs
ncScheme (c:cs) | isSchemeChar c = toLower c:ncScheme cs
ncScheme _ = ncEscape uristr -- no scheme present
ncEscape ('%':h1:h2:cs) = '%':toUpper h1:toUpper h2:ncEscape cs
ncEscape (c:cs) = c:ncEscape cs
ncEscape [] = []
-- |Encoding normalization; cf. RFC3986 section 6.2.2.2
--
normalizeEscape :: String -> String
normalizeEscape ('%':h1:h2:cs)
| isHexDigit h1 && isHexDigit h2 && isUnreserved escval =
escval:normalizeEscape cs
where
escval = chr (digitToInt h1*16+digitToInt h2)
normalizeEscape (c:cs) = c:normalizeEscape cs
normalizeEscape [] = []
-- |Path segment normalization; cf. RFC3986 section 6.2.2.3
--
normalizePathSegments :: String -> String
normalizePathSegments uristr = normstr juri
where
juri = parseURI uristr
normstr Nothing = uristr
normstr (Just u) = show (normuri u)
normuri u = u { uriPath = removeDotSegments (uriPath u) }
------------------------------------------------------------
-- Deprecated functions
------------------------------------------------------------
{-# DEPRECATED parseabsoluteURI "use parseAbsoluteURI" #-}
parseabsoluteURI :: String -> Maybe URI
parseabsoluteURI = parseAbsoluteURI
{-# DEPRECATED escapeString "use escapeURIString, and note the flipped arguments" #-}
escapeString :: String -> (Char->Bool) -> String
escapeString = flip escapeURIString
{-# DEPRECATED reserved "use isReserved" #-}
reserved :: Char -> Bool
reserved = isReserved
{-# DEPRECATED unreserved "use isUnreserved" #-}
unreserved :: Char -> Bool
unreserved = isUnreserved
-- Additional component access functions for backward compatibility
{-# DEPRECATED scheme "use uriScheme" #-}
scheme :: URI -> String
scheme = orNull init . uriScheme
{-# DEPRECATED authority "use uriAuthority, and note changed functionality" #-}
authority :: URI -> String
authority = dropss . ($"") . uriAuthToString id . uriAuthority
where
-- Old-style authority component does not include leading '//'
dropss ('/':'/':s) = s
dropss s = s
{-# DEPRECATED path "use uriPath" #-}
path :: URI -> String
path = uriPath
{-# DEPRECATED query "use uriQuery, and note changed functionality" #-}
query :: URI -> String
query = orNull tail . uriQuery
{-# DEPRECATED fragment "use uriFragment, and note changed functionality" #-}
fragment :: URI -> String
fragment = orNull tail . uriFragment
orNull :: ([a]->[a]) -> [a] -> [a]
orNull _ [] = []
orNull f as = f as
--------------------------------------------------------------------------------
--
-- Copyright (c) 2004, G. KLYNE. All rights reserved.
-- Distributed as free software under the following license.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions
-- are met:
--
-- - Redistributions of source code must retain the above copyright notice,
-- this list of conditions and the following disclaimer.
--
-- - Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
--
-- - Neither name of the copyright holders nor the names of its
-- contributors may be used to endorse or promote products derived from
-- this software without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-- HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
-- OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-- TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
--------------------------------------------------------------------------------