1 files changed, 235 insertions, 0 deletions
diff --git a/src/css-url.c b/src/css-url.c
new file mode 100644
index 0000000..c6e18bb
--- /dev/null
+++ b/src/css-url.c
@@ -0,0 +1,235 @@
+/* Collect URLs from CSS source.
+   Copyright (C) 1998, 2000-2003, 2009-2011, 2014-2015, 2018-2022 Free
+   Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+/*
+  Note that this is not an actual CSS parser, but just a lexical
+  scanner with a tiny bit more smarts bolted on top.  A full parser
+  is somewhat overkill for this job.  The only things we're interested
+  in are @import rules and url() tokens, so it's easy enough to
+  grab those without truly understanding the input.  The only downside
+  to this is that we might be coerced into downloading files that
+  a browser would ignore.  That might merit some more investigation.
+ */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+#include "css-url.h"
+#include "xstrndup.h"
+
+/* from lex.yy.c */
+extern char *yytext;
+extern int yyleng;
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len  );
+extern void yy_delete_buffer (YY_BUFFER_STATE  b);
+extern int yylex (void);
+extern void yylex_destroy(void);
+
+/*
+  Given a detected URI token, get only the URI specified within.
+  Also adjust the starting position and length of the string.
+
+  A URI can be specified with or without quotes, and the quotes
+  can be single or double quotes.  In addition there can be
+  whitespace after the opening parenthesis and before the closing
+  parenthesis.
+*/
+static char *
+get_uri_string (const char *at, int *pos, int *length)
+{
+  if (*length < 4)
+    return NULL;
+
+  if (0 != strncasecmp (at + *pos, "url(", 4))
+    return NULL;
+
+  *pos += 4;
+  *length -= 5; /* url() */
+
+  /* skip leading space */
+  while (*length > 0 && isspace (at[*pos]))
+    {
+      (*pos)++;
+      if (--(*length) == 0)
+        return NULL;
+    }
+
+  /* skip trailing space */
+  while (*length > 0 && isspace (at[*pos + *length - 1]))
+    {
+      (*length)--;
+    }
+
+  /* trim off quotes */
+  if (*length >= 2 && (at[*pos] == '\'' || at[*pos] == '"'))
+    {
+      (*pos)++;
+      *length -= 2;
+    }
+
+  if (*length <= 0)
+    return NULL;
+
+  return xstrndup (at + *pos, *length);
+}
+
+void
+get_urls_css (struct map_context *ctx, int offset, int buf_length)
+{
+  int token;
+  /*char tmp[2048];*/
+  int buffer_pos = 0;
+  int pos, length;
+  char *uri;
+  YY_BUFFER_STATE b;
+
+  /* tell flex to scan from this buffer */
+  b = yy_scan_bytes (ctx->text + offset, buf_length);
+
+  while((token = yylex()) != CSSEOF)
+    {
+      /*DEBUGP (("%s ", token_names[token]));*/
+      /* @import "foo.css"
+         or @import url(foo.css)
+      */
+      if(token == IMPORT_SYM)
+        {
+          do {
+            buffer_pos += yyleng;
+          } while((token = yylex()) == S);
+
+          /*DEBUGP (("%s ", token_names[token]));*/
+
+          if (token == STRING || token == URI)
+            {
+              /*DEBUGP (("Got URI "));*/
+              pos = buffer_pos + offset;
+              length = yyleng;
+
+              if (token == URI)
+                {
+                  uri = get_uri_string (ctx->text, &pos, &length);
+                }
+              else if (length >= 2)
+                {
+                  /* cut out quote characters */
+                  pos++;
+                  length -= 2;
+                  uri = xmalloc (length + 1);
+                  memcpy (uri, yytext + 1, length);
+                  uri[length] = '\0';
+                }
+              else
+                uri = NULL;
+
+              if (uri)
+                {
+                  struct urlpos *up = append_url (uri, pos, length, ctx);
+                  DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+
+                  if (up)
+                    {
+                      up->link_inline_p = 1;
+                      up->link_css_p = 1;
+                      up->link_expect_css = 1;
+                    }
+
+                  xfree(uri);
+                }
+            }
+        }
+      /* background-image: url(foo.png)
+         note that we don't care what
+         property this is actually on.
+      */
+      else if(token == URI)
+        {
+          pos = buffer_pos + offset;
+          length = yyleng;
+          uri = get_uri_string (ctx->text, &pos, &length);
+
+          if (uri)
+            {
+              struct urlpos *up = append_url (uri, pos, length, ctx);
+              DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+              if (up)
+                {
+                  up->link_inline_p = 1;
+                  up->link_css_p = 1;
+                }
+
+              xfree (uri);
+            }
+        }
+      buffer_pos += yyleng;
+    }
+
+  yy_delete_buffer(b);
+  yylex_destroy();
+
+  DEBUGP (("\n"));
+}
+
+struct urlpos *
+get_urls_css_file (const char *file, const char *url)
+{
+  struct file_memory *fm;
+  struct map_context ctx;
+
+  /* Load the file. */
+  fm = wget_read_file (file);
+  if (!fm)
+    {
+      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+      return NULL;
+    }
+  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
+
+  ctx.text = fm->content;
+  ctx.head = NULL;
+  ctx.base = NULL;
+  ctx.parent_base = url ? url : opt.base_href;
+  ctx.document_file = file;
+  ctx.nofollow = 0;
+
+  get_urls_css (&ctx, 0, fm->length);
+  wget_read_file_free (fm);
+  return ctx.head;
+}