Rework url matching to make it easier to add schemes.

The new approach allows fine-grained control over what a URL is composed of.

Added a bunch of new schemes.
This commit is contained in:
Diogo Sousa 2013-06-16 15:36:52 +01:00
parent 6bc05a8bc8
commit 805b33552b
1 changed file with 112 additions and 49 deletions

View File

@ -314,8 +314,8 @@ do_an_re(const char *word,int *start, int *end, int *type)
} func_t; } func_t;
func_t funcs[] = func_t funcs[] =
{ {
{ re_email, WORD_EMAIL },
{ re_url, WORD_URL }, { re_url, WORD_URL },
{ re_email, WORD_EMAIL },
{ re_channel, WORD_CHANNEL }, { re_channel, WORD_CHANNEL },
{ re_host, WORD_HOST }, { re_host, WORD_HOST },
{ re_path, WORD_PATH }, { re_path, WORD_PATH },
@ -360,7 +360,7 @@ make_re(char *grist, char *type)
GRegex *ret; GRegex *ret;
GError *err = NULL; GError *err = NULL;
ret = g_regex_new (grist, G_REGEX_CASELESS + G_REGEX_OPTIMIZE, 0, &err); ret = g_regex_new (grist, G_REGEX_CASELESS | G_REGEX_OPTIMIZE, 0, &err);
g_free (grist); g_free (grist);
return ret; return ret;
} }
@ -389,60 +389,123 @@ re_host (void)
#define LPAR "\\(" #define LPAR "\\("
#define RPAR "\\)" #define RPAR "\\)"
#define NOPARENS "[^() \t]*" #define NOPARENS "[^() \t]*"
#define PATH \
"(" \
"(" LPAR NOPARENS RPAR ")" \
"|" \
"(" NOPARENS ")" \
")*" /* Zero or more occurrences of either of these */ \
"(?<![.,?!\\]])" /* Not allowed to end with these */
#define USERINFO "([-a-z0-9._~%]+@)"
char *prefix[] = { /* Flags used to describe URIs (RFC 3986)
"irc\\.", *
"ftp\\.", * Below is an example of what the flags match.
"www\\.", *
"irc://", * URI_AUTHORITY - http://example.org:80/foo/bar
"ircs://", * ^^^^^^^^^^^^^^^^
"ftp://", * URI_USERINFO/URI_OPT_USERINFO - http://user@example.org:80/foo/bar
"http://", * ^^^^^
"https://", * URI_PATH - http://example.org:80/foo/bar
"file://", * ^^^^^^^^
"rtsp://", */
NULL #define URI_AUTHORITY (1 << 0)
#define URI_OPT_USERINFO (1 << 1)
#define URI_USERINFO (1 << 2)
#define URI_PATH (1 << 3)
struct
{
const char *scheme; /* scheme name. e.g. http */
const char *path_sep; /* string that begins the path */
int flags; /* see above (flag definitions) */
} uri[] = {
{ "irc", "/", URI_AUTHORITY | URI_PATH },
{ "ircs", "/", URI_AUTHORITY | URI_PATH },
{ "rtsp", "/", URI_AUTHORITY | URI_PATH },
{ "feed", "/", URI_AUTHORITY | URI_PATH },
{ "teamspeak", "?", URI_AUTHORITY | URI_PATH },
{ "ftp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "sftp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "ftps", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "http", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "https", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "cvs", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "svn", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "git", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "rsync", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "mumble", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "ventrilo", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "xmpp", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "file", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "h323", ";", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "imap", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "pop", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "nfs", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "smb", "/", URI_AUTHORITY | URI_OPT_USERINFO | URI_PATH },
{ "ssh", "", URI_AUTHORITY | URI_OPT_USERINFO },
{ "sip", "", URI_AUTHORITY | URI_USERINFO },
{ "sips", "", URI_AUTHORITY | URI_USERINFO },
{ "magnet", "?", URI_PATH },
{ "mailto", "", URI_PATH },
{ "bitcoin", "", URI_PATH },
{ "gtalk", "", URI_PATH },
{ "steam", "", URI_PATH },
{ NULL, '\0', 0}
}; };
static GRegex * static GRegex *
re_url (void) re_url (void)
{ {
static GRegex *url_ret; static GRegex *url_ret = NULL;
GString *grist_gstr;
char *grist; char *grist;
char *scheme; int i;
if (url_ret) return url_ret; if (url_ret) return url_ret;
scheme = g_strjoinv ("|", prefix); grist_gstr = g_string_new (NULL);
grist = g_strdup_printf (
"(" /* URL or HOST */ /* Add regex "host/path", representing a "schemeless" url */
"(" g_string_append (grist_gstr, "(" HOST OPT_PORT "/" "(" PATH ")?" ")");
SCHEME HOST OPT_PORT
"(" /* Optional "/path?query_string#fragment_id" */ for (i = 0; uri[i].scheme; i++)
"/" /* Must start with slash */ {
"(" g_string_append (grist_gstr, "|(");
"(" LPAR NOPARENS RPAR ")" g_string_append_printf (grist_gstr, "%s:", uri[i].scheme);
"|"
"(" NOPARENS ")" if (uri[i].flags & URI_AUTHORITY)
")*" /* Zero or more occurrences of either of these */ g_string_append (grist_gstr, "//");
"(?<![.,?!\\]])" /* Not allowed to end with these */
")?" /* Zero or one of this /path?query_string#fragment_id thing */ if (uri[i].flags & URI_USERINFO)
")|(" g_string_append (grist_gstr, USERINFO);
HOST OPT_PORT "/" else if (uri[i].flags & URI_OPT_USERINFO)
"(" /* Optional "path?query_string#fragment_id" */ g_string_append (grist_gstr, USERINFO "?");
"("
"(" LPAR NOPARENS RPAR ")" if (uri[i].flags & URI_AUTHORITY)
"|" g_string_append (grist_gstr, HOST OPT_PORT);
"(" NOPARENS ")"
")*" /* Zero or more occurrences of either of these */ if (uri[i].flags & URI_PATH)
"(?<![.,?!\\]])" /* Not allowed to end with these */ {
")?" /* Zero or one of this /path?query_string#fragment_id thing */ char *sep_escaped;
")"
")" sep_escaped = g_regex_escape_string (uri[i].path_sep,
, scheme strlen(uri[i].path_sep));
);
g_string_append_printf(grist_gstr, "(" "%s" PATH ")?",
sep_escaped);
g_free(sep_escaped);
}
g_string_append(grist_gstr, ")");
}
grist = g_string_free (grist_gstr, FALSE);
url_ret = make_re (grist, "re_url"); url_ret = make_re (grist, "re_url");
g_free (scheme);
return url_ret; return url_ret;
} }
@ -525,10 +588,10 @@ re_channel (void)
/* PATH description --- */ /* PATH description --- */
#ifdef WIN32 #ifdef WIN32
/* Windows path can be .\ ..\ or C: D: etc */ /* Windows path can be .\ ..\ or C: D: etc */
#define PATH "^(\\.{1,2}\\\\|[a-z]:).*" #define FS_PATH "^(\\.{1,2}\\\\|[a-z]:).*"
#else #else
/* Linux path can be / or ./ or ../ etc */ /* Linux path can be / or ./ or ../ etc */
#define PATH "^(/|\\./|\\.\\./).*" #define FS_PATH "^(/|\\./|\\.\\./).*"
#endif #endif
static GRegex * static GRegex *
@ -540,8 +603,8 @@ re_path (void)
if (path_ret) return path_ret; if (path_ret) return path_ret;
grist = g_strdup_printf ( grist = g_strdup_printf (
"(" /* PATH */ "(" /* FS_PATH */
PATH FS_PATH
")" ")"
); );
path_ret = make_re (grist, "re_path"); path_ret = make_re (grist, "re_path");