Rework url matching to make it easier to add schemes.
The new way allows great control of what a url is composed of. Added a bunch of new schemes.
This commit is contained in:
parent
6bc05a8bc8
commit
805b33552b
161
src/common/url.c
161
src/common/url.c
|
@ -314,8 +314,8 @@ do_an_re(const char *word,int *start, int *end, int *type)
|
||||||
} func_t;
|
} func_t;
|
||||||
func_t funcs[] =
|
func_t funcs[] =
|
||||||
{
|
{
|
||||||
{ re_email, WORD_EMAIL },
|
|
||||||
{ re_url, WORD_URL },
|
{ re_url, WORD_URL },
|
||||||
|
{ re_email, WORD_EMAIL },
|
||||||
{ re_channel, WORD_CHANNEL },
|
{ re_channel, WORD_CHANNEL },
|
||||||
{ re_host, WORD_HOST },
|
{ re_host, WORD_HOST },
|
||||||
{ re_path, WORD_PATH },
|
{ re_path, WORD_PATH },
|
||||||
|
@ -360,7 +360,7 @@ make_re(char *grist, char *type)
|
||||||
GRegex *ret;
|
GRegex *ret;
|
||||||
GError *err = NULL;
|
GError *err = NULL;
|
||||||
|
|
||||||
ret = g_regex_new (grist, G_REGEX_CASELESS + G_REGEX_OPTIMIZE, 0, &err);
|
ret = g_regex_new (grist, G_REGEX_CASELESS | G_REGEX_OPTIMIZE, 0, &err);
|
||||||
g_free (grist);
|
g_free (grist);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -389,60 +389,123 @@ re_host (void)
|
||||||
#define LPAR "\\("
|
#define LPAR "\\("
|
||||||
#define RPAR "\\)"
|
#define RPAR "\\)"
|
||||||
#define NOPARENS "[^() \t]*"
|
#define NOPARENS "[^() \t]*"
|
||||||
|
#define PATH \
|
||||||
|
"(" \
|
||||||
|
"(" LPAR NOPARENS RPAR ")" \
|
||||||
|
"|" \
|
||||||
|
"(" NOPARENS ")" \
|
||||||
|
")*" /* Zero or more occurrences of either of these */ \
|
||||||
|
"(?<![.,?!\\]])" /* Not allowed to end with these */
|
||||||
|
#define USERINFO "([-a-z0-9._~%]+@)"
|
||||||
|
|
||||||
char *prefix[] = {
|
/* Flags used to describe URIs (RFC 3986)
|
||||||
"irc\\.",
|
*
|
||||||
"ftp\\.",
|
* Bellow is an example of what the flags match.
|
||||||
"www\\.",
|
*
|
||||||
"irc://",
|
* URI_AUTHORITY - http://example.org:80/foo/bar
|
||||||
"ircs://",
|
* ^^^^^^^^^^^^^^^^
|
||||||
"ftp://",
|
* URI_USERINFO/URI_OPT_USERINFO - http://user@example.org:80/foo/bar
|
||||||
"http://",
|
* ^^^^^
|
||||||
"https://",
|
* URI_PATH - http://example.org:80/foo/bar
|
||||||
"file://",
|
* ^^^^^^^^
|
||||||
"rtsp://",
|
*/
|
||||||
NULL
|
#define URI_AUTHORITY (1 << 0)
|
||||||
|
#define URI_OPT_USERINFO (1 << 1)
|
||||||
|
#define URI_USERINFO (1 << 2)
|
||||||
|
#define URI_PATH (1 << 3)
|
||||||
|
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
const char *scheme; /* scheme name. e.g. http */
|
||||||
|
const char *path_sep; /* string that begins the path */
|
||||||
|
int flags; /* see above (flag definitions) */
|
||||||
|
} uri[] = {
|
||||||
|
{ "irc", "/", URI_AUTHORITY | URI_PATH },
|
||||||
|
{ "ircs", "/", URI_AUTHORITY | URI_PATH },
|
||||||
|
{ "rtsp", "/", URI_AUTHORITY | URI_PATH },
|
||||||
|
{ "feed", "/", URI_AUTHORITY | URI_PATH },
|
||||||
|
{ "teamspeak", "?", URI_AUTHORITY | URI_PATH },
|
||||||
|
{ "ftp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "sftp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "ftps", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "http", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "https", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "cvs", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "svn", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "git", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "rsync", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "mumble", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "ventrilo", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "xmpp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "file", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "h323", ";", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "imap", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "pop", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "nfs", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "smb", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
|
||||||
|
{ "ssh", "", URI_AUTHORITY | URI_OPT_USERINFO },
|
||||||
|
{ "sip", "", URI_AUTHORITY | URI_USERINFO },
|
||||||
|
{ "sips", "", URI_AUTHORITY | URI_USERINFO },
|
||||||
|
{ "magnet", "?", URI_PATH },
|
||||||
|
{ "mailto", "", URI_PATH },
|
||||||
|
{ "bitcoin", "", URI_PATH },
|
||||||
|
{ "gtalk", "", URI_PATH },
|
||||||
|
{ "steam", "", URI_PATH },
|
||||||
|
{ NULL, '\0', 0}
|
||||||
};
|
};
|
||||||
|
|
||||||
static GRegex *
|
static GRegex *
|
||||||
re_url (void)
|
re_url (void)
|
||||||
{
|
{
|
||||||
static GRegex *url_ret;
|
static GRegex *url_ret = NULL;
|
||||||
|
GString *grist_gstr;
|
||||||
char *grist;
|
char *grist;
|
||||||
char *scheme;
|
int i;
|
||||||
|
|
||||||
if (url_ret) return url_ret;
|
if (url_ret) return url_ret;
|
||||||
|
|
||||||
scheme = g_strjoinv ("|", prefix);
|
grist_gstr = g_string_new (NULL);
|
||||||
grist = g_strdup_printf (
|
|
||||||
"(" /* URL or HOST */
|
/* Add regex "host/path", representing a "schemeless" url */
|
||||||
"("
|
g_string_append (grist_gstr, "(" HOST OPT_PORT "/" "(" PATH ")?" ")");
|
||||||
SCHEME HOST OPT_PORT
|
|
||||||
"(" /* Optional "/path?query_string#fragment_id" */
|
for (i = 0; uri[i].scheme; i++)
|
||||||
"/" /* Must start with slash */
|
{
|
||||||
"("
|
g_string_append (grist_gstr, "|(");
|
||||||
"(" LPAR NOPARENS RPAR ")"
|
g_string_append_printf (grist_gstr, "%s:", uri[i].scheme);
|
||||||
"|"
|
|
||||||
"(" NOPARENS ")"
|
if (uri[i].flags & URI_AUTHORITY)
|
||||||
")*" /* Zero or more occurrences of either of these */
|
g_string_append (grist_gstr, "//");
|
||||||
"(?<![.,?!\\]])" /* Not allowed to end with these */
|
|
||||||
")?" /* Zero or one of this /path?query_string#fragment_id thing */
|
if (uri[i].flags & URI_USERINFO)
|
||||||
")|("
|
g_string_append (grist_gstr, USERINFO);
|
||||||
HOST OPT_PORT "/"
|
else if (uri[i].flags & URI_OPT_USERINFO)
|
||||||
"(" /* Optional "path?query_string#fragment_id" */
|
g_string_append (grist_gstr, USERINFO "?");
|
||||||
"("
|
|
||||||
"(" LPAR NOPARENS RPAR ")"
|
if (uri[i].flags & URI_AUTHORITY)
|
||||||
"|"
|
g_string_append (grist_gstr, HOST OPT_PORT);
|
||||||
"(" NOPARENS ")"
|
|
||||||
")*" /* Zero or more occurrences of either of these */
|
if (uri[i].flags & URI_PATH)
|
||||||
"(?<![.,?!\\]])" /* Not allowed to end with these */
|
{
|
||||||
")?" /* Zero or one of this /path?query_string#fragment_id thing */
|
char *sep_escaped;
|
||||||
")"
|
|
||||||
")"
|
sep_escaped = g_regex_escape_string (uri[i].path_sep,
|
||||||
, scheme
|
strlen(uri[i].path_sep));
|
||||||
);
|
|
||||||
|
g_string_append_printf(grist_gstr, "(" "%s" PATH ")?",
|
||||||
|
sep_escaped);
|
||||||
|
|
||||||
|
g_free(sep_escaped);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
g_string_append(grist_gstr, ")");
|
||||||
|
}
|
||||||
|
|
||||||
|
grist = g_string_free (grist_gstr, FALSE);
|
||||||
|
|
||||||
url_ret = make_re (grist, "re_url");
|
url_ret = make_re (grist, "re_url");
|
||||||
g_free (scheme);
|
|
||||||
return url_ret;
|
return url_ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -525,10 +588,10 @@ re_channel (void)
|
||||||
/* PATH description --- */
|
/* PATH description --- */
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
/* Windows path can be .\ ..\ or C: D: etc */
|
/* Windows path can be .\ ..\ or C: D: etc */
|
||||||
#define PATH "^(\\.{1,2}\\\\|[a-z]:).*"
|
#define FS_PATH "^(\\.{1,2}\\\\|[a-z]:).*"
|
||||||
#else
|
#else
|
||||||
/* Linux path can be / or ./ or ../ etc */
|
/* Linux path can be / or ./ or ../ etc */
|
||||||
#define PATH "^(/|\\./|\\.\\./).*"
|
#define FS_PATH "^(/|\\./|\\.\\./).*"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static GRegex *
|
static GRegex *
|
||||||
|
@ -540,8 +603,8 @@ re_path (void)
|
||||||
if (path_ret) return path_ret;
|
if (path_ret) return path_ret;
|
||||||
|
|
||||||
grist = g_strdup_printf (
|
grist = g_strdup_printf (
|
||||||
"(" /* PATH */
|
"(" /* FS_PATH */
|
||||||
PATH
|
FS_PATH
|
||||||
")"
|
")"
|
||||||
);
|
);
|
||||||
path_ret = make_re (grist, "re_path");
|
path_ret = make_re (grist, "re_path");
|
||||||
|
|
Loading…
Reference in New Issue