summaryrefslogtreecommitdiffstats
path: root/src/css-url.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/css-url.c')
-rw-r--r--src/css-url.c235
1 files changed, 235 insertions, 0 deletions
diff --git a/src/css-url.c b/src/css-url.c
new file mode 100644
index 0000000..20abfec
--- /dev/null
+++ b/src/css-url.c
@@ -0,0 +1,235 @@
+/* Collect URLs from CSS source.
+ Copyright (C) 1998, 2000-2003, 2009-2011, 2014-2015, 2018-2023 Free
+ Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+/*
+ Note that this is not an actual CSS parser, but just a lexical
+ scanner with a tiny bit more smarts bolted on top. A full parser
+ is somewhat overkill for this job. The only things we're interested
+ in are @import rules and url() tokens, so it's easy enough to
+ grab those without truly understanding the input. The only downside
+ to this is that we might be coerced into downloading files that
+ a browser would ignore. That might merit some more investigation.
+ */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+#include "css-url.h"
+#include "xstrndup.h"
+
+/* from lex.yy.c */
+extern char *yytext;
+extern int yyleng;
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len );
+extern void yy_delete_buffer (YY_BUFFER_STATE b);
+extern int yylex (void);
+extern void yylex_destroy(void);
+
+/*
+ Given a detected URI token, get only the URI specified within.
+ Also adjust the starting position and length of the string.
+
+ A URI can be specified with or without quotes, and the quotes
+ can be single or double quotes. In addition there can be
+ whitespace after the opening parenthesis and before the closing
+ parenthesis.
+*/
+static char *
+get_uri_string (const char *at, int *pos, int *length)
+{
+ if (*length < 4)
+ return NULL;
+
+ if (0 != strncasecmp (at + *pos, "url(", 4))
+ return NULL;
+
+ *pos += 4;
+ *length -= 5; /* url() */
+
+ /* skip leading space */
+ while (*length > 0 && isspace (at[*pos]))
+ {
+ (*pos)++;
+ if (--(*length) == 0)
+ return NULL;
+ }
+
+ /* skip trailing space */
+ while (*length > 0 && isspace (at[*pos + *length - 1]))
+ {
+ (*length)--;
+ }
+
+ /* trim off quotes */
+ if (*length >= 2 && (at[*pos] == '\'' || at[*pos] == '"'))
+ {
+ (*pos)++;
+ *length -= 2;
+ }
+
+ if (*length <= 0)
+ return NULL;
+
+ return xstrndup (at + *pos, *length);
+}
+
+void
+get_urls_css (struct map_context *ctx, int offset, int buf_length)
+{
+ int token;
+ /*char tmp[2048];*/
+ int buffer_pos = 0;
+ int pos, length;
+ char *uri;
+ YY_BUFFER_STATE b;
+
+ /* tell flex to scan from this buffer */
+ b = yy_scan_bytes (ctx->text + offset, buf_length);
+
+ while((token = yylex()) != CSSEOF)
+ {
+ /*DEBUGP (("%s ", token_names[token]));*/
+ /* @import "foo.css"
+ or @import url(foo.css)
+ */
+ if(token == IMPORT_SYM)
+ {
+ do {
+ buffer_pos += yyleng;
+ } while((token = yylex()) == S);
+
+ /*DEBUGP (("%s ", token_names[token]));*/
+
+ if (token == STRING || token == URI)
+ {
+ /*DEBUGP (("Got URI "));*/
+ pos = buffer_pos + offset;
+ length = yyleng;
+
+ if (token == URI)
+ {
+ uri = get_uri_string (ctx->text, &pos, &length);
+ }
+ else if (length >= 2)
+ {
+ /* cut out quote characters */
+ pos++;
+ length -= 2;
+ uri = xmalloc (length + 1);
+ memcpy (uri, yytext + 1, length);
+ uri[length] = '\0';
+ }
+ else
+ uri = NULL;
+
+ if (uri)
+ {
+ struct urlpos *up = append_url (uri, pos, length, ctx);
+ DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+
+ if (up)
+ {
+ up->link_inline_p = 1;
+ up->link_css_p = 1;
+ up->link_expect_css = 1;
+ }
+
+ xfree(uri);
+ }
+ }
+ }
+ /* background-image: url(foo.png)
+ note that we don't care what
+ property this is actually on.
+ */
+ else if(token == URI)
+ {
+ pos = buffer_pos + offset;
+ length = yyleng;
+ uri = get_uri_string (ctx->text, &pos, &length);
+
+ if (uri)
+ {
+ struct urlpos *up = append_url (uri, pos, length, ctx);
+ DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+ if (up)
+ {
+ up->link_inline_p = 1;
+ up->link_css_p = 1;
+ }
+
+ xfree (uri);
+ }
+ }
+ buffer_pos += yyleng;
+ }
+
+ yy_delete_buffer(b);
+ yylex_destroy();
+
+ DEBUGP (("\n"));
+}
+
+struct urlpos *
+get_urls_css_file (const char *file, const char *url)
+{
+ struct file_memory *fm;
+ struct map_context ctx;
+
+ /* Load the file. */
+ fm = wget_read_file (file);
+ if (!fm)
+ {
+ logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+ return NULL;
+ }
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
+
+ ctx.text = fm->content;
+ ctx.head = NULL;
+ ctx.base = NULL;
+ ctx.parent_base = url ? url : opt.base_href;
+ ctx.document_file = file;
+ ctx.nofollow = 0;
+
+ get_urls_css (&ctx, 0, fm->length);
+ wget_read_file_free (fm);
+ return ctx.head;
+}