summaryrefslogtreecommitdiffstats
path: root/src/res.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/res.c')
-rw-r--r--src/res.c650
1 files changed, 650 insertions, 0 deletions
diff --git a/src/res.c b/src/res.c
new file mode 100644
index 0000000..4ff228e
--- /dev/null
+++ b/src/res.c
@@ -0,0 +1,650 @@
+/* Support for Robot Exclusion Standard (RES).
+ Copyright (C) 2001, 2006-2011, 2015, 2018-2022 Free Software
+ Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+/* This file implements the Robot Exclusion Standard (RES).
+
+ RES is a simple protocol that enables site admins to signalize to
+ the web crawlers that certain parts of the site should not be
+ accessed. All the admin needs to do is create a "robots.txt" file
+ in the web server root, and use simple commands to allow or
+ disallow access to certain parts of the site.
+
+ The first specification was written by Martijn Koster in 1994, and
+ is still available at <http://www.robotstxt.org/orig.html>.
+ In 1996, Martijn wrote an Internet Draft specifying an improved RES
+ specification; however, that work was apparently abandoned since
+ the draft has expired in 1997 and hasn't been replaced since. The
+ draft is available at
+ <http://www.robotstxt.org/norobots-rfc.txt>.
+
+ This file implements RES as specified by the draft. Note that this
+ only handles the "robots.txt" support. The META tag that controls
+ whether the links should be followed is handled in `html-url.c'.
+
+ Known deviations:
+
+ * The end-of-line comment recognition is more in the spirit of the
+ Bourne Shell (as specified by RES-1994). That means that
+ "foo#bar" is taken literally, whereas "foo #bar" is interpreted
+ as "foo". The Draft apparently specifies that both should be
+ interpreted as "foo".
+
+ * We don't recognize sole CR as the line ending.
+
+ * We don't implement expiry mechanism for /robots.txt specs. I
+ consider it non-necessary for a relatively short-lived
+ application such as Wget. Besides, it is highly questionable
+ whether anyone deploys the recommended expiry scheme for
+ robots.txt.
+
+ Entry points are functions res_parse, res_parse_from_file,
+ res_match_path, res_register_specs, res_get_specs, and
+ res_retrieve_file. */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "utils.h"
+#include "hash.h"
+#include "url.h"
+#include "retr.h"
+#include "res.h"
+#include "c-strcase.h"
+
+#ifdef TESTING
+#include "../tests/unit-tests.h"
+#endif
+
+struct path_info {
+ char *path;
+ bool allowedp;
+ bool user_agent_exact_p;
+};
+
+struct robot_specs {
+ int count;
+ int size;
+ struct path_info *paths;
+};
+
+/* Parsing the robot spec. */
+
+/* Check whether AGENT (a string of length LENGTH) equals "wget" or
+ "*". If it is either of them, *matches is set to one. If it is
+ "wget", *exact_match is set to one. */
+
+static void
+match_user_agent (const char *agent, int length,
+ bool *matches, bool *exact_match)
+{
+ if (length == 1 && *agent == '*')
+ {
+ *matches = true;
+ *exact_match = false;
+ }
+ else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
+ {
+ *matches = true;
+ *exact_match = true;
+ }
+ else
+ {
+ *matches = false;
+ *exact_match = false;
+ }
+}
+
+/* Add a path specification between PATH_B and PATH_E as one of the
+ paths in SPECS. */
+
+static void
+add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
+ bool allowedp, bool exactp)
+{
+ struct path_info pp;
+ if (path_b < path_e && *path_b == '/')
+ /* Our path representation doesn't use a leading slash, so remove
+ one from theirs. */
+ ++path_b;
+ pp.path = strdupdelim (path_b, path_e);
+ pp.allowedp = allowedp;
+ pp.user_agent_exact_p = exactp;
+ ++specs->count;
+ if (specs->count > specs->size)
+ {
+ if (specs->size == 0)
+ specs->size = 1;
+ else
+ specs->size <<= 1;
+ specs->paths = xrealloc (specs->paths,
+ specs->size * sizeof (struct path_info));
+ }
+ specs->paths[specs->count - 1] = pp;
+}
+
+/* Recreate SPECS->paths with only those paths that have
+ user_agent_exact_p set to true. */
+
+static void
+prune_non_exact (struct robot_specs *specs)
+{
+ struct path_info *newpaths;
+ int i, j, cnt;
+ cnt = 0;
+ for (i = 0; i < specs->count; i++)
+ if (specs->paths[i].user_agent_exact_p)
+ ++cnt;
+ newpaths = xnew_array (struct path_info, cnt);
+ for (i = 0, j = 0; i < specs->count; i++)
+ if (specs->paths[i].user_agent_exact_p)
+ newpaths[j++] = specs->paths[i];
+ else
+ xfree (specs->paths[i].path);
+ assert (j == cnt);
+ xfree (specs->paths);
+ specs->paths = newpaths;
+ specs->count = cnt;
+ specs->size = cnt;
+}
+
+#define EOL(p) ((p) >= lineend)
+
+#define SKIP_SPACE(p) do { \
+ while (!EOL (p) && c_isspace (*p)) \
+ ++p; \
+} while (0)
+
+#define FIELD_IS(string_literal) \
+ BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
+
+/* Parse textual RES specs beginning with SOURCE of length LENGTH.
+ Return a specs objects ready to be fed to res_match_path.
+
+ The parsing itself is trivial, but creating a correct SPECS object
+ is trickier than it seems, because RES is surprisingly byzantine if
+ you attempt to implement it correctly.
+
+ A "record" is a block of one or more `User-Agent' lines followed by
+ one or more `Allow' or `Disallow' lines. Record is accepted by
+ Wget if one of the `User-Agent' lines was "wget", or if the user
+ agent line was "*".
+
+ After all the lines have been read, we examine whether an exact
+ ("wget") user-agent field was specified. If so, we delete all the
+ lines read under "User-Agent: *" blocks because we have our own
+ Wget-specific blocks. This enables the admin to say:
+
+ User-Agent: *
+ Disallow: /
+
+ User-Agent: google
+ User-Agent: wget
+ Disallow: /cgi-bin
+
+ This means that to Wget and to Google, /cgi-bin is disallowed,
+ whereas for all other crawlers, everything is disallowed.
+ res_parse is implemented so that the order of records doesn't
+ matter. In the case above, the "User-Agent: *" could have come
+ after the other one. */
+
+struct robot_specs *
+res_parse (const char *source, int length)
+{
+ int line_count = 1;
+
+ const char *p = source;
+ const char *end = source + length;
+
+ /* true if last applicable user-agent field matches Wget. */
+ bool user_agent_applies = false;
+
+ /* true if last applicable user-agent field *exactly* matches
+ Wget. */
+ bool user_agent_exact = false;
+
+ /* whether we ever encountered exact user agent. */
+ bool found_exact = false;
+
+ /* count of allow/disallow lines in the current "record", i.e. after
+ the last `user-agent' instructions. */
+ int record_count = 0;
+
+ struct robot_specs *specs = xnew0 (struct robot_specs);
+
+ while (1)
+ {
+ const char *lineend, *lineend_real;
+ const char *field_b, *field_e;
+ const char *value_b, *value_e;
+
+ if (p == end)
+ break;
+ lineend_real = memchr (p, '\n', end - p);
+ if (lineend_real)
+ ++lineend_real;
+ else
+ lineend_real = end;
+ lineend = lineend_real;
+
+ /* Before doing anything else, check whether the line is empty
+ or comment-only. */
+ SKIP_SPACE (p);
+ if (EOL (p) || *p == '#')
+ goto next;
+
+ /* Make sure the end-of-line comments are respected by setting
+ lineend to a location preceding the first comment. Real line
+ ending remains in lineend_real. */
+ for (lineend = p; lineend < lineend_real; lineend++)
+ if ((lineend == p || c_isspace (*(lineend - 1)))
+ && *lineend == '#')
+ break;
+
+ /* Ignore trailing whitespace in the same way. */
+ while (lineend > p && c_isspace (*(lineend - 1)))
+ --lineend;
+
+ assert (!EOL (p));
+
+ field_b = p;
+ while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
+ ++p;
+ field_e = p;
+
+ SKIP_SPACE (p);
+ if (field_b == field_e || EOL (p) || *p != ':')
+ {
+ DEBUGP (("Ignoring malformed line %d\n", line_count));
+ goto next;
+ }
+ ++p; /* skip ':' */
+ SKIP_SPACE (p);
+
+ value_b = p;
+ while (!EOL (p))
+ ++p;
+ value_e = p;
+
+ /* Finally, we have a syntactically valid line. */
+ if (FIELD_IS ("user-agent"))
+ {
+ /* We have to support several cases:
+
+ --previous records--
+
+ User-Agent: foo
+ User-Agent: Wget
+ User-Agent: bar
+ ... matching record ...
+
+ User-Agent: baz
+ User-Agent: qux
+ ... non-matching record ...
+
+ User-Agent: *
+ ... matching record, but will be pruned later ...
+
+ We have to respect `User-Agent' at the beginning of each
+ new record simply because we don't know if we're going to
+ encounter "Wget" among the agents or not. Hence,
+ match_user_agent is called when record_count != 0.
+
+ But if record_count is 0, we have to keep calling it
+ until it matches, and if that happens, we must not call
+ it any more, until the next record. Hence the other part
+ of the condition. */
+ if (record_count != 0 || user_agent_applies == false)
+ match_user_agent (value_b, value_e - value_b,
+ &user_agent_applies, &user_agent_exact);
+ if (user_agent_exact)
+ found_exact = true;
+ record_count = 0;
+ }
+ else if (FIELD_IS ("allow"))
+ {
+ if (user_agent_applies)
+ {
+ add_path (specs, value_b, value_e, true, user_agent_exact);
+ }
+ ++record_count;
+ }
+ else if (FIELD_IS ("disallow"))
+ {
+ if (user_agent_applies)
+ {
+ bool allowed = false;
+ if (value_b == value_e)
+ /* Empty "disallow" line means everything is *allowed*! */
+ allowed = true;
+ add_path (specs, value_b, value_e, allowed, user_agent_exact);
+ }
+ ++record_count;
+ }
+ else
+ {
+ DEBUGP (("Ignoring unknown field at line %d\n", line_count));
+ goto next;
+ }
+
+ next:
+ p = lineend_real;
+ ++line_count;
+ }
+
+ if (found_exact)
+ {
+ /* We've encountered an exactly matching user-agent. Throw out
+ all the stuff with user-agent: *. */
+ prune_non_exact (specs);
+ }
+ else if (specs->size > specs->count)
+ {
+ /* add_path normally over-allocates specs->paths. Reallocate it
+ to the correct size in order to conserve some memory. */
+ specs->paths = xrealloc (specs->paths,
+ specs->count * sizeof (struct path_info));
+ specs->size = specs->count;
+ }
+
+ return specs;
+}
+
+/* The same like res_parse, but first map the FILENAME into memory,
+ and then parse it. */
+
+struct robot_specs *
+res_parse_from_file (const char *filename)
+{
+ struct robot_specs *specs;
+ struct file_memory *fm = wget_read_file (filename);
+ if (!fm)
+ {
+ logprintf (LOG_NOTQUIET, _("Cannot open %s: %s\n"),
+ filename, strerror (errno));
+ return NULL;
+ }
+ specs = res_parse (fm->content, fm->length);
+ wget_read_file_free (fm);
+ return specs;
+}
+
+static void
+free_specs (struct robot_specs *specs)
+{
+ int i;
+ for (i = 0; i < specs->count; i++)
+ xfree (specs->paths[i].path);
+ xfree (specs->paths);
+ xfree (specs);
+}
+
+/* Matching of a path according to the specs. */
+
+/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
+ that number is not a numerical representation of '/', decode C and
+ advance the pointer. */
+
+#define DECODE_MAYBE(c, ptr) do { \
+ if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
+ { \
+ unsigned char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
+ if (decoded != '/') \
+ { \
+ c = decoded; \
+ ptr += 2; \
+ } \
+ } \
+} while (0)
+
+/* The inner matching engine: return true if RECORD_PATH matches
+ URL_PATH. The rules for matching are described at
+ <http://www.robotstxt.org/norobots-rfc.txt>, section 3.2.2. */
+
+static bool
+matches (const char *record_path, const char *url_path)
+{
+ const char *rp = record_path;
+ const char *up = url_path;
+
+ for (; ; ++rp, ++up)
+ {
+ char rc = *rp;
+ char uc = *up;
+ if (!rc)
+ return true;
+ if (!uc)
+ return false;
+ DECODE_MAYBE(rc, rp);
+ DECODE_MAYBE(uc, up);
+ if (rc != uc)
+ return false;
+ }
+}
+
+/* Iterate through all paths in SPECS. For the first one that
+ matches, return its allow/reject status. If none matches,
+ retrieval is by default allowed. */
+
+bool
+res_match_path (const struct robot_specs *specs, const char *path)
+{
+ int i;
+ if (!specs)
+ return true;
+ for (i = 0; i < specs->count; i++)
+ if (matches (specs->paths[i].path, path))
+ {
+ bool allowedp = specs->paths[i].allowedp;
+ DEBUGP (("%s path %s because of rule %s.\n",
+ allowedp ? "Allowing" : "Rejecting",
+ path, quote (specs->paths[i].path)));
+ return allowedp;
+ }
+ return true;
+}
+
+/* Registering the specs. */
+
+static struct hash_table *registered_specs;
+
+/* Register RES specs that below to server on HOST:PORT. They will
+ later be retrievable using res_get_specs. */
+
+void
+res_register_specs (const char *host, int port, struct robot_specs *specs)
+{
+ struct robot_specs *old;
+ char buf[256], *hp, *hp_old;
+
+ if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
+ hp = aprintf("%s:%d", host, port);
+ else
+ hp = buf;
+
+ if (!registered_specs)
+ registered_specs = make_nocase_string_hash_table (0);
+
+ if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
+ {
+ if (hp != buf)
+ xfree (hp);
+ if (old)
+ free_specs (old);
+ hash_table_put (registered_specs, hp_old, specs);
+ }
+ else
+ {
+ hash_table_put (registered_specs, hp == buf ? xstrdup (hp) : hp, specs);
+ }
+}
+
+/* Get the specs that belong to HOST:PORT. */
+
+struct robot_specs *
+res_get_specs (const char *host, int port)
+{
+ char buf[256], *hp;
+
+ if (!registered_specs)
+ return NULL;
+
+ if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
+ hp = aprintf("%s:%d", host, port);
+ else
+ hp = buf;
+
+ return hash_table_get (registered_specs, hp);
+}
+
+/* Loading the robots file. */
+
+#define RES_SPECS_LOCATION "/robots.txt"
+
+/* Retrieve the robots.txt from the server root of the server that
+ serves URL. The file will be named according to the currently
+ active rules, and the file name will be returned in *file.
+
+ Return true if robots were retrieved OK, false otherwise. */
+
+bool
+res_retrieve_file (const char *url, char **file, struct iri *iri)
+{
+ struct iri *i = iri_new ();
+ uerr_t err;
+ char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+ int saved_ts_val = opt.timestamping;
+ int saved_sp_val = opt.spider, url_err;
+ struct url * url_parsed;
+
+ /* Copy server URI encoding for a possible IDNA transformation, no need to
+ encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+ set_uri_encoding (i, iri->uri_encoding, false);
+ i->utf8_encode = false;
+
+ logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
+ *file = NULL;
+ opt.timestamping = false;
+ opt.spider = false;
+
+ url_parsed = url_parse (robots_url, &url_err, i, true);
+ if (!url_parsed)
+ {
+ char *error = url_error (robots_url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+ xfree (error);
+ err = URLERROR;
+ }
+ else
+ {
+ err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
+ false, i, false);
+ url_free(url_parsed);
+ }
+
+ opt.timestamping = saved_ts_val;
+ opt.spider = saved_sp_val;
+ xfree (robots_url);
+ iri_free (i);
+
+ if (err != RETROK && *file != NULL)
+ {
+ /* If the file is not retrieved correctly, but retrieve_url
+ allocated the file name, deallocate is here so that the
+ caller doesn't have to worry about it. */
+ xfree (*file);
+ }
+ return err == RETROK;
+}
+
+bool
+is_robots_txt_url (const char *url)
+{
+ char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
+ bool ret = are_urls_equal (url, robots_url);
+
+ xfree (robots_url);
+
+ return ret;
+}
+
+#if defined DEBUG_MALLOC || defined TESTING
+void
+res_cleanup (void)
+{
+ if (registered_specs)
+ {
+ hash_table_iterator iter;
+ for (hash_table_iterate (registered_specs, &iter);
+ hash_table_iter_next (&iter);
+ )
+ {
+ xfree (iter.key);
+ free_specs (iter.value);
+ }
+ hash_table_destroy (registered_specs);
+ registered_specs = NULL;
+ }
+}
+#endif
+
+#ifdef TESTING
+
+const char *
+test_is_robots_txt_url(void)
+{
+ unsigned i;
+ static const struct {
+ const char *url;
+ bool expected_result;
+ } test_array[] = {
+ { "http://www.yoyodyne.com/robots.txt", true },
+ { "http://www.yoyodyne.com/somepath/", false },
+ { "http://www.yoyodyne.com/somepath/robots.txt", false },
+ };
+
+ for (i = 0; i < countof(test_array); ++i)
+ {
+ mu_assert ("test_is_robots_txt_url: wrong result",
+ is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
+ }
+
+ return NULL;
+}
+
+#endif /* TESTING */
+
+/*
+ * vim: et ts=2 sw=2
+ */