Diffstat (limited to 'src/urlregex.c')
-rw-r--r--  src/urlregex.c  275
1 file changed, 275 insertions, 0 deletions
diff --git a/src/urlregex.c b/src/urlregex.c
new file mode 100644
index 0000000..3bab8c1
--- /dev/null
+++ b/src/urlregex.c
@@ -0,0 +1,275 @@
+#include <string.h>
+#include "urlregex.h"
+
+#define LP_BUG_BASE_URL "https://bugs.launchpad.net/bugs/"
+#define HTTP_BASE_URL "http://"
+#define MAILTO_BASE_URL "mailto:"
+
+/* Adapted from src/terminal-screen.c in the gnome-terminal source */
+
+#define USERCHARS "-[:alnum:]"
+#define USERCHARS_CLASS "[" USERCHARS "]"
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+#define HOSTCHARS_CLASS "[-[:alnum:]]"
+#define HOST HOSTCHARS_CLASS "+(\\." HOSTCHARS_CLASS "+)*"
+#define PORT "(?:\\:[[:digit:]]{1,5})?"
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#%\\E]"
+#define PATHTERM_CLASS "[^\\Q]'.:}>) \t\r\n,\"\\E]"
+#define SCHEME "(?:news:|telnet:|nntp:|file:\\/|https?:|ftps?:|sftp:|webcal:)"
+#define USERPASS USERCHARS_CLASS "+(?:" PASSCHARS_CLASS "+)?"
+#define URLPATH "(?:(/"PATHCHARS_CLASS"+(?:[(]"PATHCHARS_CLASS"*[)])*"PATHCHARS_CLASS"*)*"PATHTERM_CLASS")?"
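+
+/*
+ * Note: GRegex is PCRE-based, so the \Q...\E sequences above quote the
+ * enclosed punctuation literally inside the character classes.
+ */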
+
+typedef enum {
+    FLAVOR_AS_IS,
+    FLAVOR_DEFAULT_TO_HTTP,
+    FLAVOR_EMAIL,
+    FLAVOR_LP
+} UrlRegexFlavor;
+
+typedef struct {
+    const char *pattern;
+    UrlRegexFlavor flavor;
+    GRegexCompileFlags flags;
+} UrlRegexPattern;
+
+static UrlRegexPattern url_regex_patterns[] = {
+    { SCHEME "//(?:" USERPASS "\\@)?" HOST PORT URLPATH, FLAVOR_AS_IS, G_REGEX_CASELESS },
+    { "(?:www|ftp)" HOSTCHARS_CLASS "*\\." HOST PORT URLPATH, FLAVOR_DEFAULT_TO_HTTP, G_REGEX_CASELESS },
+    { "(?:mailto:)?" USERCHARS_CLASS "[" USERCHARS ".]*\\@" HOSTCHARS_CLASS "+\\." HOST, FLAVOR_EMAIL, G_REGEX_CASELESS },
+    { "(?:lp: #)([[:digit:]]+)", FLAVOR_LP, G_REGEX_CASELESS }
+};
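+
+/*
+ * Illustrative examples of the kind of text each pattern is intended to catch
+ * (assumptions based on the expressions above, not an exhaustive list):
+ *   pattern 0: "http://user@host.example:8080/path"
+ *   pattern 1: "www.example.com", "ftp.example.org/pub"
+ *   pattern 2: "user@example.com", "mailto:user@example.com"
+ *   pattern 3: "lp: #12345" (Launchpad bug references)
+ */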
+
+static GRegex **url_regexes;
+static UrlRegexFlavor *url_regex_flavors;
+static guint n_url_regexes;
+
+static char *urlregex_expand(GMatchInfo *match_info, UrlRegexFlavor flavor);
+
+/**
+ * urlregex_init:
+ *
+ * Compiles all of the URL-matching regular expressions. This must be called
+ * once before any of the other urlregex functions are used.
+ **/
+void
+urlregex_init(void)
+{
+    guint i;
+
+    n_url_regexes = G_N_ELEMENTS(url_regex_patterns);
+    url_regexes = g_new0(GRegex*, n_url_regexes);
+    url_regex_flavors = g_new0(UrlRegexFlavor, n_url_regexes);
+
+    for (i = 0; i < n_url_regexes; i++) {
+        GError *error = NULL;
+
+        url_regexes[i] = g_regex_new(url_regex_patterns[i].pattern,
+                url_regex_patterns[i].flags | G_REGEX_OPTIMIZE, 0, &error);
+
+        if (error != NULL) {
+            g_message("%s", error->message);
+            g_error_free(error);
+        }
+
+        url_regex_flavors[i] = url_regex_patterns[i].flavor;
+    }
+}
+
+/**
+ * urlregex_count:
+ *
+ * Returns: the number of available URL patterns.
+ **/
+guint
+urlregex_count(void)
+{
+    return n_url_regexes;
+}
+
+/**
+ * urlregex_split:
+ * @text: the text to split
+ * @index: the index of the pattern to apply
+ *
+ * Splits @text into a list of MatchGroup objects, marking sections that match
+ * the selected pattern as MATCHED and everything else as NOT_MATCHED.
+ *
+ * Returns: a newly allocated list; free it with urlregex_matchgroup_list_free().
+ **/
+GList *
+urlregex_split(const char *text, guint index)
+{
+    GList *result = NULL;
+    GRegex *pattern = url_regexes[index];
+    GMatchInfo *match_info;
+    int text_length = strlen(text);
+
+    int start_pos = 0;
+    int end_pos = 0;
+    int last_pos = 0;
+    int len = 0;
+
+    gchar *token;
+    gchar *expanded;
+
+    g_regex_match(pattern, text, 0, &match_info);
+
+    while (g_match_info_matches(match_info)) {
+        /* Append previously unmatched text */
+        g_match_info_fetch_pos(match_info, 0, &start_pos, &end_pos);
+        len = start_pos - last_pos;
+        if (len > 0) {
+            token = g_strndup(text + last_pos, len);
+            result = g_list_append(result, urlregex_matchgroup_new(token, token, NOT_MATCHED));
+            g_free(token);
+        }
+
+        /* Append matched text */
+        token = urlregex_expand(match_info, FLAVOR_AS_IS);
+        expanded = urlregex_expand(match_info, url_regex_flavors[index]);
+        result = g_list_append(result, urlregex_matchgroup_new(token, expanded, MATCHED));
+        g_free(token);
+        g_free(expanded);
+
+        g_match_info_next(match_info, NULL);
+        last_pos = end_pos;
+    }
+    /* Append the text after the last match */
+    if (last_pos < text_length) {
+        token = g_strdup(text + last_pos);
+        result = g_list_append(result, urlregex_matchgroup_new(token, token, NOT_MATCHED));
+        g_free(token);
+    }
+
+    g_match_info_free(match_info);
+
+    return result;
+}
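+
+/*
+ * Rough usage sketch (illustrative only; assumes the patterns compiled above):
+ *
+ *   urlregex_init();
+ *   GList *groups = urlregex_split("see www.example.com now", 1);
+ *
+ * would be expected to yield three MatchGroup entries: "see " (NOT_MATCHED),
+ * "www.example.com" expanded to "http://www.example.com" (MATCHED), and
+ * " now" (NOT_MATCHED).  The caller frees the list with
+ * urlregex_matchgroup_list_free().
+ */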
+
+/**
+ * urlregex_expand:
+ * @match_info: describes the matched URL
+ * @flavor: the type of URL
+ *
+ * Expands the matched URL based on the given flavor.
+ *
+ * Returns: a newly allocated string holding the expanded URL.
+ **/
+static char *
+urlregex_expand(GMatchInfo *match_info, UrlRegexFlavor flavor)
+{
+    char *t1;
+    char *t2;
+
+    switch (flavor) {
+    case FLAVOR_DEFAULT_TO_HTTP:
+        t1 = g_match_info_fetch(match_info, 0);
+        t2 = g_strconcat(HTTP_BASE_URL, t1, NULL);
+        g_free(t1);
+        return t2;
+    case FLAVOR_EMAIL:
+        t1 = g_match_info_fetch(match_info, 0);
+        if (!g_str_has_prefix(t1, MAILTO_BASE_URL)) {
+            t2 = g_strconcat(MAILTO_BASE_URL, t1, NULL);
+            g_free(t1);
+            return t2;
+        }
+        else
+            return t1;
+    case FLAVOR_LP:
+        t1 = g_match_info_fetch(match_info, 1);
+        t2 = g_strconcat(LP_BUG_BASE_URL, t1, NULL);
+        g_free(t1);
+        return t2;
+    default:
+        return g_match_info_fetch(match_info, 0);
+    }
+}
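+
+/*
+ * For illustration (assumed behaviour, derived from the cases above):
+ *   FLAVOR_DEFAULT_TO_HTTP: "www.example.com"  -> "http://www.example.com"
+ *   FLAVOR_EMAIL:           "user@example.com" -> "mailto:user@example.com"
+ *   FLAVOR_LP:              "lp: #12345"       -> "https://bugs.launchpad.net/bugs/12345"
+ *   FLAVOR_AS_IS:           the matched text is returned unchanged.
+ */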
+
+/**
+ * urlregex_split_all:
+ * @text: the text to split
+ *
+ * Splits @text into a flat list of MatchGroup objects by applying every
+ * available URL pattern, in order, to the sections that have not yet matched
+ * an earlier pattern.
+ *
+ * Returns: a newly allocated list; free it with urlregex_matchgroup_list_free().
+ **/
+GList *
+urlregex_split_all(const char *text)
+{
+    GList *result = NULL;
+    GList *temp = NULL;
+    guint i;
+
+    result = g_list_append(result, urlregex_matchgroup_new(text, text, NOT_MATCHED));
+
+    /* Apply each regex in order to sections that haven't yet been matched */
+    for (i = 0; i < n_url_regexes; i++) {
+        GList *item;
+        temp = NULL;
+        for (item = result; item; item = item->next) {
+            MatchGroup *group = (MatchGroup *)item->data;
+            if (group->type == NOT_MATCHED) {
+                GList *list = urlregex_split(group->text, i);
+                GList *subitem;
+                for (subitem = list; subitem; subitem = subitem->next) {
+                    MatchGroup *subgroup = (MatchGroup *)subitem->data;
+                    temp = g_list_append(temp, subgroup);
+                }
+                g_list_free(list);
+                urlregex_matchgroup_free(group);
+            }
+            else {
+                temp = g_list_append(temp, group);
+            }
+        }
+        g_list_free(result);
+        result = temp;
+    }
+
+    return result;
+}
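+
+/*
+ * Minimal usage sketch (illustrative; the variable names are invented here):
+ *
+ *   GList *groups, *item;
+ *
+ *   urlregex_init();
+ *   groups = urlregex_split_all("report it at lp: #12345 or mail admin@example.com");
+ *   for (item = groups; item; item = item->next) {
+ *       MatchGroup *group = (MatchGroup *)item->data;
+ *       if (group->type == MATCHED)
+ *           g_print("%s -> %s\n", group->text, group->expanded);
+ *   }
+ *   urlregex_matchgroup_list_free(groups);
+ */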
+
+/**
+ * urlregex_matchgroup_new:
+ * @text: the original text
+ * @expanded: the expanded url
+ * @type: whether this is a matched or unmatched group
+ *
+ * Creates a new MatchGroup object.
+ *
+ * Returns: a newly allocated MatchGroup; free it with urlregex_matchgroup_free().
+ **/
+MatchGroup *
+urlregex_matchgroup_new(const char *text, const char *expanded, MatchType type)
+{
+    MatchGroup *result = g_new0(MatchGroup, 1);
+    result->text = g_strdup(text);
+    /* TODO: Save space using same data if text == expanded? */
+    result->expanded = g_strdup(expanded);
+    result->type = type;
+    return result;
+}
+
+/**
+ * urlregex_matchgroup_free:
+ * @group: the match group
+ *
+ * Frees the MatchGroup object.
+ **/
+void
+urlregex_matchgroup_free(MatchGroup *group)
+{
+    g_free(group->expanded);
+    group->expanded = NULL;
+    g_free(group->text);
+    group->text = NULL;
+    g_free(group);
+}
+
+/**
+ * urlregex_matchgroup_list_free:
+ * @list: the match group list
+ *
+ * Frees a list of MatchGroup objects returned by urlregex_split() or
+ * urlregex_split_all().
+ **/
+void
+urlregex_matchgroup_list_free(GList *list)
+{
+    GList *item;
+    for (item = list; item; item = item->next) {
+        urlregex_matchgroup_free((MatchGroup *)item->data);
+    }
+    g_list_free(list);
+}