summaryrefslogtreecommitdiffstats
path: root/src/is_csv.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/is_csv.c196
1 files changed, 196 insertions, 0 deletions
diff --git a/src/is_csv.c b/src/is_csv.c
new file mode 100644
index 0000000..597d775
--- /dev/null
+++ b/src/is_csv.c
@@ -0,0 +1,196 @@
+/*-
+ * Copyright (c) 2019 Christos Zoulas
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Parse CSV object serialization format (RFC-4180, RFC-7111)
+ */
+
+#ifndef TEST
+#include "file.h"
+
+#ifndef lint
+FILE_RCSID("@(#)$File: is_csv.c,v 1.10 2022/09/24 20:30:13 christos Exp $")
+#endif
+
+#include <string.h>
+#include "magic.h"
+#else
+#include <sys/types.h>
+#endif
+
+
+#ifdef DEBUG
+#include <stdio.h>
+#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
+#else
+#define DPRINTF(fmt, ...)
+#endif
+
+/*
+ * if CSV_LINES == 0:
+ * check all the lines in the buffer
+ * otherwise:
+ * check only up-to the number of lines specified
+ *
+ * the last line count is always ignored if it does not end in CRLF
+ */
+#ifndef CSV_LINES
+#define CSV_LINES 10
+#endif
+
+static int csv_parse(const unsigned char *, const unsigned char *);
+
+static const unsigned char *
+eatquote(const unsigned char *uc, const unsigned char *ue)
+{
+ int quote = 0;
+
+ while (uc < ue) {
+ unsigned char c = *uc++;
+ if (c != '"') {
+ // We already got one, done.
+ if (quote) {
+ return --uc;
+ }
+ continue;
+ }
+ if (quote) {
+ // quote-quote escapes
+ quote = 0;
+ continue;
+ }
+ // first quote
+ quote = 1;
+ }
+ return ue;
+}
+
+static int
+csv_parse(const unsigned char *uc, const unsigned char *ue)
+{
+ size_t nf = 0, tf = 0, nl = 0;
+
+ while (uc < ue) {
+ switch (*uc++) {
+ case '"':
+ // Eat until the matching quote
+ uc = eatquote(uc, ue);
+ break;
+ case ',':
+ nf++;
+ break;
+ case '\n':
+ DPRINTF("%zu %zu %zu\n", nl, nf, tf);
+ nl++;
+#if CSV_LINES
+ if (nl == CSV_LINES)
+ return tf != 0 && tf == nf;
+#endif
+ if (tf == 0) {
+ // First time and no fields, give up
+ if (nf == 0)
+ return 0;
+ // First time, set the number of fields
+ tf = nf;
+ } else if (tf != nf) {
+ // Field number mismatch, we are done.
+ return 0;
+ }
+ nf = 0;
+ break;
+ default:
+ break;
+ }
+ }
+ return tf && nl > 2;
+}
+
+#ifndef TEST
+int
+file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
+{
+ const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
+ const unsigned char *ue = uc + b->flen;
+ int mime = ms->flags & MAGIC_MIME;
+
+ if (!looks_text)
+ return 0;
+
+ if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
+ return 0;
+
+ if (!csv_parse(uc, ue))
+ return 0;
+
+ if (mime == MAGIC_MIME_ENCODING)
+ return 1;
+
+ if (mime) {
+ if (file_printf(ms, "text/csv") == -1)
+ return -1;
+ return 1;
+ }
+
+ if (file_printf(ms, "CSV text") == -1)
+ return -1;
+
+ return 1;
+}
+
+#else
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <err.h>
+
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ struct stat st;
+ unsigned char *p;
+
+ if ((fd = open(argv[1], O_RDONLY)) == -1)
+ err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
+
+ if (fstat(fd, &st) == -1)
+ err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
+
+ if ((p = CAST(char *, malloc(st.st_size))) == NULL)
+ err(EXIT_FAILURE, "Can't allocate %jd bytes",
+ (intmax_t)st.st_size);
+ if (read(fd, p, st.st_size) != st.st_size)
+ err(EXIT_FAILURE, "Can't read %jd bytes",
+ (intmax_t)st.st_size);
+ printf("is csv %d\n", csv_parse(p, p + st.st_size));
+ return 0;
+}
+#endif