/*! @license
 * Copyright 2008 The Closure Library Authors
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @fileoverview Simple utilities for splitting URI strings.
 *
 * Uses features of RFC 3986 for parsing/formatting URIs:
 *   http://www.ietf.org/rfc/rfc3986.txt
 *
 * @author gboyer@google.com (Garrett Boyer) - The "lightened" design.
 * @author msamuel@google.com (Mike Samuel) - Domain knowledge and regexes.
 */

goog.provide('goog.uri.utils');
goog.provide('goog.uri.utils.ComponentIndex');


/**
 * A regular expression for breaking a URI into its component parts.
 *
 * {@link http://www.ietf.org/rfc/rfc3986.txt} says in Appendix B
 * As the "first-match-wins" algorithm is identical to the "greedy"
 * disambiguation method used by POSIX regular expressions, it is natural and
 * commonplace to use a regular expression for parsing the potential five
 * components of a URI reference.
 *
 * The following line is the regular expression for breaking-down a
 * well-formed URI reference into its components.
 *
 * <pre>
 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 *  12            3  4          5       6  7        8 9
 * </pre>
 *
 * The numbers in the second line above are only to assist readability; they
 * indicate the reference points for each subexpression (i.e., each paired
 * parenthesis). We refer to the value matched for subexpression <n> as $<n>.
 * For example, matching the above expression to
 * <pre>
 *     http://www.ics.uci.edu/pub/ietf/uri/#Related
 * </pre>
 * results in the following subexpression matches:
 * <pre>
 *    $1 = http:
 *    $2 = http
 *    $3 = //www.ics.uci.edu
 *    $4 = www.ics.uci.edu
 *    $5 = /pub/ietf/uri/
 *    $6 = <undefined>
 *    $7 = <undefined>
 *    $8 = #Related
 *    $9 = Related
 * </pre>
 * where <undefined> indicates that the component is not present, as is the
 * case for the query component in the above example. Therefore, we can
 * determine the value of the five components as
 * <pre>
 *    scheme    = $2
 *    authority = $4
 *    path      = $5
 *    query     = $7
 *    fragment  = $9
 * </pre>
 *
 * The regular expression has been modified slightly to expose the
 * userInfo, domain, and port separately from the authority.
 * The modified version yields
 * <pre>
 *    $1 = http              scheme
 *    $2 = <undefined>       userInfo -\
 *    $3 = www.ics.uci.edu   domain     | authority
 *    $4 = <undefined>       port     -/
 *    $5 = /pub/ietf/uri/    path
 *    $6 = <undefined>       query without ?
 *    $7 = Related           fragment without #
 * </pre>
 *
 * Every group is optional and the path, query and fragment classes between
 * them accept any character that does not start the next component, so this
 * expression matches every possible input string.
 *
 * @type {!RegExp}
 * @private
 */
goog.uri.utils.splitRe_ = new RegExp(
    '^' +
    '(?:' +
    '([^:/?#.]+)' +  // scheme - ignore special characters
                     // used by other URL parts such as :,
                     // ?, /, #, and .
    ':)?' +
    '(?://' +
    '(?:([^/?#]*)@)?' +  // userInfo
    '([^/#?]*?)' +       // domain
    '(?::([0-9]+))?' +   // port
    '(?=[/#?]|$)' +      // authority-terminating character
    ')?' +
    '([^?#]+)?' +          // path
    '(?:\\?([^#]*))?' +    // query
    // The fragment uses [\s\S] rather than '.', because '.' does not match
    // line terminators. With '.', a fragment containing e.g. '\n' could not
    // be consumed up to the trailing '$', the whole match failed, and
    // goog.uri.utils.split returned null.
    '(?:#([\\s\\S]*))?' +  // fragment
    '$');


/**
 * The index of each URI component in the return value of goog.uri.utils.split.
 * Index 0 of that array is the full match, so components start at 1.
 * @enum {number}
 */
goog.uri.utils.ComponentIndex = {
  SCHEME: 1,
  USER_INFO: 2,
  DOMAIN: 3,
  PORT: 4,
  PATH: 5,
  QUERY_DATA: 6,
  FRAGMENT: 7
};


/**
 * Splits a URI into its component parts.
 *
 * Each component can be accessed via the component indices; for example:
 * <pre>
 * goog.uri.utils.split(someStr)[goog.uri.utils.ComponentIndex.QUERY_DATA];
 * </pre>
 *
 * @param {string} uri The URI string to examine.
 * @return {!Array.<string|undefined>} Each component still URI-encoded.
 *     Each component that is present will contain the encoded value, whereas
 *     components that are not present will be undefined or empty, depending
 *     on the browser's regular expression implementation.  Never null, since
 *     arbitrary strings may still look like path names.
 */
goog.uri.utils.split = function(uri) {
  // See @return comment -- never null.
  return /** @type {!Array.<string|undefined>} */ (
      uri.match(goog.uri.utils.splitRe_));
};