# Source code for cdms2.cdurlparse

"""
Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
Resource Locators", by R. Fielding, UC Irvine, June 1995.
"""

from __future__ import print_function
# Standard/builtin Python modules
import string

# Scheme classification tables.  The empty-string entry '' stands for
# "no scheme given", i.e. the behaviour applied by default.

# Schemes for which relative references may be resolved against a base.
uses_relative = [
    'ftp', 'http', 'ldap', 'gopher', 'nntp', 'wais', 'file',
    'https', 'shttp',
    'prospero', '',
]

# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = [
    'ftp', 'http', 'ldap', 'gopher', 'nntp', 'telnet', 'wais',
    'file',
    'https', 'shttp', 'snews',
    'prospero', '',
]

# Schemes listed as non-hierarchical.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
    'snews',
]

# Schemes for which ';<params>' is split off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'ldap',
    'https', 'shttp',
    '',
]

# Schemes for which '?<query>' is split off the path.
uses_query = [
    'http', 'ldap', 'wais',
    'https', 'shttp',
    'gopher',
    '',
]

# Schemes for which '#<fragment>' is split off the path.
uses_fragment = [
    'ftp', 'hdl', 'http', 'ldap', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Characters valid in scheme names
scheme_chars = ''.join((string.ascii_letters, string.digits, '+-.'))

# Upper bound on the number of memoised parse results, and the memo itself.
MAX_CACHE_SIZE = 20
_parse_cache = dict()


def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level ``_parse_cache`` to a new empty dict, so
    previously memoised ``urlparse`` results are discarded.
    """
    global _parse_cache
    _parse_cache = {}


# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up in smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme='', allow_fragments=1):
    """Split a URL into its six top-level components.

    The general URL shape is
    ``<scheme>://<netloc>/<path>;<params>?<query>#<fragment>``.
    Components are not broken down further (``netloc`` stays one string)
    and ``%`` escapes are not expanded.

    Parameters
    ----------
    url : str
        The URL (absolute or relative) to parse.
    scheme : str, optional
        Scheme to assume when ``url`` does not carry a valid one.
    allow_fragments : int or bool, optional
        When false, a ``#`` is not treated as a fragment delimiter and
        stays in the preceding component.

    Returns
    -------
    tuple
        ``(scheme, netloc, path, params, query, fragment)``; missing
        components are empty strings.

    Notes
    -----
    Results are memoised in the module-level ``_parse_cache`` keyed on the
    three arguments; the cache is emptied once it holds
    ``MAX_CACHE_SIZE`` entries.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] in ['http', 'ldap']:  # optimize the common case
            # Fast path: every delimiter applies, so the scheme
            # classification tables are not consulted here.
            scheme = url[:i].lower()
            url = url[i + 1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i + 1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i + 1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i + 1:]
                url = url[:i]
            result = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = result
            return result
        # Text before the first ':' is a scheme only if every character
        # is a legal scheme character; otherwise keep the default scheme
        # and leave the URL untouched.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # netloc runs up to the next '/', or to the end of the URL.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    # Split from the right-most delimiter inwards: fragment, query, params.
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i + 1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i + 1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i + 1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result

# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).


def urlunparse(xxx_todo_changeme):
    """Reassemble a URL string from its six parsed components.

    Parameters
    ----------
    xxx_todo_changeme : tuple
        A 6-tuple ``(scheme, netloc, path, params, query, fragment)`` as
        returned by ``urlparse``.

    Returns
    -------
    str
        The recombined URL.  Empty ``params``, ``query`` and ``fragment``
        are omitted together with their delimiter, so the result may
        differ textually from the string that was originally parsed
        (e.g. a trailing ``?`` with an empty query is dropped).
    """
    (scheme, netloc, url, params, query, fragment) = xxx_todo_changeme
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A netloc must be followed by an absolute path.
        if url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.


def urljoin(base, url, allow_fragments=1):
    """Resolve ``url`` against ``base`` and return the combined URL.

    Parameters
    ----------
    base : str
        The base URL.  If empty (falsy), ``url`` is returned unchanged.
    url : str
        An absolute or relative URL.
    allow_fragments : int or bool, optional
        Passed through to ``urlparse`` for both ``base`` and ``url``.

    Returns
    -------
    str
        ``url`` itself when it has a different scheme than ``base``, a
        scheme not listed in ``uses_relative``, or its own netloc;
        otherwise ``url`` interpreted relative to ``base`` with ``.`` and
        ``..`` path segments collapsed where possible.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    # A different or non-relative scheme means url stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # An empty path inherits the base path and, if none was given, the
    # base query.
    # NOTE(review): the base params (bparams) are not inherited here;
    # RFC 1808 section 4 step 5 appears to call for that -- confirm
    # before changing behaviour.
    if not path:
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Replace the last segment of the base path with the relative path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    # A trailing '.' becomes an empty segment so the result keeps its
    # trailing '/'; every other '.' segment is simply dropped.
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly collapse the first interior '<name>/..' pair; the
    # first and last segments are never the '..' examined here.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i - 1]:
                del segments[i - 1:i + 1]
                break
            i = i + 1
        else:
            break
    # Handle a '..' left in the final position.
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, "/".join(segments),
                       params, query, fragment))


def urldefrag(url):
    """Remove any existing fragment from a URL.

    Parameters
    ----------
    url : str
        The URL to split.

    Returns
    -------
    tuple
        ``(defragmented_url, fragment)``.  If the URL contained no
        fragment, the second element is the empty string.  The first
        element is rebuilt with ``urlunparse`` and so may differ
        textually from the input where it had redundant delimiters.
    """
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag


# Default input for test().  The first non-blank line is taken as the base
# URL; every following non-blank line has the form
#     <relative-url> = <URL:expected-absolute-url>
# and test() reports a mismatch when joining the relative URL onto the base
# does not produce the expected value.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?


def test():
    """Run the URL-joining self-test and print the results.

    Input comes from the file named by ``sys.argv[1]`` (``-`` means
    standard input) or, with no argument, from the built-in
    ``test_input`` table.  The first non-blank line supplies the base
    URL; each later line's first word is parsed with ``urlparse`` and
    joined onto the base with ``urljoin``.  Lines of the form
    ``<url> = <URL:expected>`` are checked and a mismatch is reported
    with an ``EXPECTED`` line.
    """
    import sys
    base = ''
    opened = False  # only close streams this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
            opened = True
    else:
        import io
        fp = io.StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            # The first URL seen becomes the base for all later lines.
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED', words[2], '!!!!!!!!!!')
    finally:
        if opened:
            fp.close()


# When executed as a script, run the self-test; test() reads an optional
# input file name from sys.argv[1].
if __name__ == '__main__':
    test()
# Source code for cdms2.cdurlparse
#
# Table of Contents
#
# Search