Diffstat (limited to 'src/arrow/go/arrow/csv/reader_test.go')
-rw-r--r--  src/arrow/go/arrow/csv/reader_test.go  604
1 file changed, 604 insertions(+), 0 deletions(-)
diff --git a/src/arrow/go/arrow/csv/reader_test.go b/src/arrow/go/arrow/csv/reader_test.go
new file mode 100644
index 000000000..c2c94f6b6
--- /dev/null
+++ b/src/arrow/go/arrow/csv/reader_test.go
@@ -0,0 +1,604 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package csv_test
+
+import (
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "testing"
+
+ "github.com/apache/arrow/go/v6/arrow"
+ "github.com/apache/arrow/go/v6/arrow/csv"
+ "github.com/apache/arrow/go/v6/arrow/memory"
+)
+
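+// Example reads ';'-separated data with '#' comment lines, producing one record per row by default.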
+func Example() {
+ f := bytes.NewBufferString(`## a simple set of data: int64;float64;string
+0;0;str-0
+1;1;str-1
+2;2;str-2
+3;3;str-3
+4;4;str-4
+5;5;str-5
+6;6;str-6
+7;7;str-7
+8;8;str-8
+9;9;str-9
+`)
+
+ schema := arrow.NewSchema(
+ []arrow.Field{
+ {Name: "i64", Type: arrow.PrimitiveTypes.Int64},
+ {Name: "f64", Type: arrow.PrimitiveTypes.Float64},
+ {Name: "str", Type: arrow.BinaryTypes.String},
+ },
+ nil,
+ )
+ r := csv.NewReader(f, schema, csv.WithComment('#'), csv.WithComma(';'))
+ defer r.Release()
+
+ n := 0
+ for r.Next() {
+ rec := r.Record()
+ for i, col := range rec.Columns() {
+ fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
+ }
+ n++
+ }
+
+ // Output:
+ // rec[0]["i64"]: [0]
+ // rec[0]["f64"]: [0]
+ // rec[0]["str"]: ["str-0"]
+ // rec[1]["i64"]: [1]
+ // rec[1]["f64"]: [1]
+ // rec[1]["str"]: ["str-1"]
+ // rec[2]["i64"]: [2]
+ // rec[2]["f64"]: [2]
+ // rec[2]["str"]: ["str-2"]
+ // rec[3]["i64"]: [3]
+ // rec[3]["f64"]: [3]
+ // rec[3]["str"]: ["str-3"]
+ // rec[4]["i64"]: [4]
+ // rec[4]["f64"]: [4]
+ // rec[4]["str"]: ["str-4"]
+ // rec[5]["i64"]: [5]
+ // rec[5]["f64"]: [5]
+ // rec[5]["str"]: ["str-5"]
+ // rec[6]["i64"]: [6]
+ // rec[6]["f64"]: [6]
+ // rec[6]["str"]: ["str-6"]
+ // rec[7]["i64"]: [7]
+ // rec[7]["f64"]: [7]
+ // rec[7]["str"]: ["str-7"]
+ // rec[8]["i64"]: [8]
+ // rec[8]["f64"]: [8]
+ // rec[8]["str"]: ["str-8"]
+ // rec[9]["i64"]: [9]
+ // rec[9]["f64"]: [9]
+ // rec[9]["str"]: ["str-9"]
+}
+
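+// Example_withChunk reads the same data but groups rows three at a time via csv.WithChunk(3).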
+func Example_withChunk() {
+ f := bytes.NewBufferString(`## a simple set of data: int64;float64;string
+0;0;str-0
+1;1;str-1
+2;2;str-2
+3;3;str-3
+4;4;str-4
+5;5;str-5
+6;6;str-6
+7;7;str-7
+8;8;str-8
+9;9;str-9
+`)
+
+ schema := arrow.NewSchema(
+ []arrow.Field{
+ {Name: "i64", Type: arrow.PrimitiveTypes.Int64},
+ {Name: "f64", Type: arrow.PrimitiveTypes.Float64},
+ {Name: "str", Type: arrow.BinaryTypes.String},
+ },
+ nil,
+ )
+ r := csv.NewReader(
+ f, schema,
+ csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(3),
+ )
+ defer r.Release()
+
+ n := 0
+ for r.Next() {
+ rec := r.Record()
+ for i, col := range rec.Columns() {
+ fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
+ }
+ n++
+ }
+
+ // Output:
+ // rec[0]["i64"]: [0 1 2]
+ // rec[0]["f64"]: [0 1 2]
+ // rec[0]["str"]: ["str-0" "str-1" "str-2"]
+ // rec[1]["i64"]: [3 4 5]
+ // rec[1]["f64"]: [3 4 5]
+ // rec[1]["str"]: ["str-3" "str-4" "str-5"]
+ // rec[2]["i64"]: [6 7 8]
+ // rec[2]["f64"]: [6 7 8]
+ // rec[2]["str"]: ["str-6" "str-7" "str-8"]
+ // rec[3]["i64"]: [9]
+ // rec[3]["f64"]: [9]
+ // rec[3]["str"]: ["str-9"]
+}
+
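+// TestCSVReader runs the reader over equivalent files with and without a header row.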
+func TestCSVReader(t *testing.T) {
+ tests := []struct {
+ Name string
+ File string
+ Header bool
+ }{{
+ Name: "NoHeader",
+ File: "testdata/types.csv",
+ Header: false,
+ }, {
+ Name: "Header",
+ File: "testdata/header.csv",
+ Header: true,
+ }}
+ for _, test := range tests {
+ t.Run(test.Name, func(t *testing.T) {
+ testCSVReader(t, test.File, test.Header)
+ })
+ }
+}
+
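+// defaultNullValues are the strings treated as null when passed to csv.WithNullReader.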
+var defaultNullValues = []string{"", "NULL", "null", "N/A"}
+
+func testCSVReader(t *testing.T, filepath string, withHeader bool) {
+ mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer mem.AssertSize(t, 0)
+
+ raw, err := ioutil.ReadFile(filepath)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ schema := arrow.NewSchema(
+ []arrow.Field{
+ arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean},
+ arrow.Field{Name: "i8", Type: arrow.PrimitiveTypes.Int8},
+ arrow.Field{Name: "i16", Type: arrow.PrimitiveTypes.Int16},
+ arrow.Field{Name: "i32", Type: arrow.PrimitiveTypes.Int32},
+ arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
+ arrow.Field{Name: "u8", Type: arrow.PrimitiveTypes.Uint8},
+ arrow.Field{Name: "u16", Type: arrow.PrimitiveTypes.Uint16},
+ arrow.Field{Name: "u32", Type: arrow.PrimitiveTypes.Uint32},
+ arrow.Field{Name: "u64", Type: arrow.PrimitiveTypes.Uint64},
+ arrow.Field{Name: "f32", Type: arrow.PrimitiveTypes.Float32},
+ arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
+ arrow.Field{Name: "str", Type: arrow.BinaryTypes.String},
+ },
+ nil,
+ )
+ r := csv.NewReader(bytes.NewReader(raw), schema,
+ csv.WithAllocator(mem),
+ csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithHeader(withHeader),
+ csv.WithNullReader(true, defaultNullValues...),
+ )
+ defer r.Release()
+
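+ // Retain/Release must balance; the deferred Release above drops the final reference.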
+ r.Retain()
+ r.Release()
+
+ if got, want := r.Schema(), schema; !got.Equal(want) {
+ t.Fatalf("invalid schema: got=%v, want=%v", got, want)
+ }
+
+ out := new(bytes.Buffer)
+
+ n := 0
+ for r.Next() {
+ rec := r.Record()
+ for i, col := range rec.Columns() {
+ fmt.Fprintf(out, "rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
+ }
+ n++
+ }
+
+ if got, want := n, 3; got != want {
+ t.Fatalf("invalid number of rows: got=%d, want=%d", got, want)
+ }
+
+ want := `rec[0]["bool"]: [true]
+rec[0]["i8"]: [-1]
+rec[0]["i16"]: [-1]
+rec[0]["i32"]: [-1]
+rec[0]["i64"]: [-1]
+rec[0]["u8"]: [1]
+rec[0]["u16"]: [1]
+rec[0]["u32"]: [1]
+rec[0]["u64"]: [1]
+rec[0]["f32"]: [1.1]
+rec[0]["f64"]: [1.1]
+rec[0]["str"]: ["str-1"]
+rec[1]["bool"]: [false]
+rec[1]["i8"]: [-2]
+rec[1]["i16"]: [-2]
+rec[1]["i32"]: [-2]
+rec[1]["i64"]: [-2]
+rec[1]["u8"]: [2]
+rec[1]["u16"]: [2]
+rec[1]["u32"]: [2]
+rec[1]["u64"]: [2]
+rec[1]["f32"]: [2.2]
+rec[1]["f64"]: [2.2]
+rec[1]["str"]: ["str-2"]
+rec[2]["bool"]: [(null)]
+rec[2]["i8"]: [(null)]
+rec[2]["i16"]: [(null)]
+rec[2]["i32"]: [(null)]
+rec[2]["i64"]: [(null)]
+rec[2]["u8"]: [(null)]
+rec[2]["u16"]: [(null)]
+rec[2]["u32"]: [(null)]
+rec[2]["u64"]: [(null)]
+rec[2]["f32"]: [(null)]
+rec[2]["f64"]: [(null)]
+rec[2]["str"]: [(null)]
+`
+
+ if got, want := out.String(), want; got != want {
+ t.Fatalf("invalid output:\ngot= %s\nwant=%s\n", got, want)
+ }
+
+ if r.Err() != nil {
+ t.Fatalf("unexpected error: %v", r.Err())
+ }
+
+ // test error modes
+ {
+ r := csv.NewReader(bytes.NewReader(raw), schema,
+ csv.WithAllocator(mem),
+ csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithHeader(withHeader),
+ csv.WithNullReader(true),
+ )
+
+ r.Next()
+ r.Record()
+
+ r.Release()
+ }
+}
+
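+// TestCSVReaderWithChunk checks how rows are grouped into records for a range of csv.WithChunk values.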
+func TestCSVReaderWithChunk(t *testing.T) {
+ mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer mem.AssertSize(t, 0)
+
+ raw, err := ioutil.ReadFile("testdata/simple.csv")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ schema := arrow.NewSchema(
+ []arrow.Field{
+ arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
+ arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64},
+ arrow.Field{Name: "str", Type: arrow.BinaryTypes.String},
+ },
+ nil,
+ )
+
+ for _, tc := range []struct {
+ name string
+ opts []csv.Option
+ records int
+ want string
+ }{
+ {
+ name: "chunk=default",
+ opts: []csv.Option{csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';')},
+ records: 10,
+ want: `rec[0]["i64"]: [0]
+rec[0]["f64"]: [0]
+rec[0]["str"]: ["str-0"]
+rec[1]["i64"]: [1]
+rec[1]["f64"]: [1]
+rec[1]["str"]: ["str-1"]
+rec[2]["i64"]: [2]
+rec[2]["f64"]: [2]
+rec[2]["str"]: ["str-2"]
+rec[3]["i64"]: [3]
+rec[3]["f64"]: [3]
+rec[3]["str"]: ["str-3"]
+rec[4]["i64"]: [4]
+rec[4]["f64"]: [4]
+rec[4]["str"]: ["str-4"]
+rec[5]["i64"]: [5]
+rec[5]["f64"]: [5]
+rec[5]["str"]: ["str-5"]
+rec[6]["i64"]: [6]
+rec[6]["f64"]: [6]
+rec[6]["str"]: ["str-6"]
+rec[7]["i64"]: [7]
+rec[7]["f64"]: [7]
+rec[7]["str"]: ["str-7"]
+rec[8]["i64"]: [8]
+rec[8]["f64"]: [8]
+rec[8]["str"]: ["str-8"]
+rec[9]["i64"]: [9]
+rec[9]["f64"]: [9]
+rec[9]["str"]: ["str-9"]
+`,
+ },
+ {
+ name: "chunk=0",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(0),
+ },
+ records: 10,
+ want: `rec[0]["i64"]: [0]
+rec[0]["f64"]: [0]
+rec[0]["str"]: ["str-0"]
+rec[1]["i64"]: [1]
+rec[1]["f64"]: [1]
+rec[1]["str"]: ["str-1"]
+rec[2]["i64"]: [2]
+rec[2]["f64"]: [2]
+rec[2]["str"]: ["str-2"]
+rec[3]["i64"]: [3]
+rec[3]["f64"]: [3]
+rec[3]["str"]: ["str-3"]
+rec[4]["i64"]: [4]
+rec[4]["f64"]: [4]
+rec[4]["str"]: ["str-4"]
+rec[5]["i64"]: [5]
+rec[5]["f64"]: [5]
+rec[5]["str"]: ["str-5"]
+rec[6]["i64"]: [6]
+rec[6]["f64"]: [6]
+rec[6]["str"]: ["str-6"]
+rec[7]["i64"]: [7]
+rec[7]["f64"]: [7]
+rec[7]["str"]: ["str-7"]
+rec[8]["i64"]: [8]
+rec[8]["f64"]: [8]
+rec[8]["str"]: ["str-8"]
+rec[9]["i64"]: [9]
+rec[9]["f64"]: [9]
+rec[9]["str"]: ["str-9"]
+`,
+ },
+ {
+ name: "chunk=1",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(1),
+ },
+ records: 10,
+ want: `rec[0]["i64"]: [0]
+rec[0]["f64"]: [0]
+rec[0]["str"]: ["str-0"]
+rec[1]["i64"]: [1]
+rec[1]["f64"]: [1]
+rec[1]["str"]: ["str-1"]
+rec[2]["i64"]: [2]
+rec[2]["f64"]: [2]
+rec[2]["str"]: ["str-2"]
+rec[3]["i64"]: [3]
+rec[3]["f64"]: [3]
+rec[3]["str"]: ["str-3"]
+rec[4]["i64"]: [4]
+rec[4]["f64"]: [4]
+rec[4]["str"]: ["str-4"]
+rec[5]["i64"]: [5]
+rec[5]["f64"]: [5]
+rec[5]["str"]: ["str-5"]
+rec[6]["i64"]: [6]
+rec[6]["f64"]: [6]
+rec[6]["str"]: ["str-6"]
+rec[7]["i64"]: [7]
+rec[7]["f64"]: [7]
+rec[7]["str"]: ["str-7"]
+rec[8]["i64"]: [8]
+rec[8]["f64"]: [8]
+rec[8]["str"]: ["str-8"]
+rec[9]["i64"]: [9]
+rec[9]["f64"]: [9]
+rec[9]["str"]: ["str-9"]
+`,
+ },
+ {
+ name: "chunk=3",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(3),
+ },
+ records: 4,
+ want: `rec[0]["i64"]: [0 1 2]
+rec[0]["f64"]: [0 1 2]
+rec[0]["str"]: ["str-0" "str-1" "str-2"]
+rec[1]["i64"]: [3 4 5]
+rec[1]["f64"]: [3 4 5]
+rec[1]["str"]: ["str-3" "str-4" "str-5"]
+rec[2]["i64"]: [6 7 8]
+rec[2]["f64"]: [6 7 8]
+rec[2]["str"]: ["str-6" "str-7" "str-8"]
+rec[3]["i64"]: [9]
+rec[3]["f64"]: [9]
+rec[3]["str"]: ["str-9"]
+`,
+ },
+ {
+ name: "chunk=6",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(6),
+ },
+ records: 2,
+ want: `rec[0]["i64"]: [0 1 2 3 4 5]
+rec[0]["f64"]: [0 1 2 3 4 5]
+rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5"]
+rec[1]["i64"]: [6 7 8 9]
+rec[1]["f64"]: [6 7 8 9]
+rec[1]["str"]: ["str-6" "str-7" "str-8" "str-9"]
+`,
+ },
+ {
+ name: "chunk=10",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(10),
+ },
+ records: 1,
+ want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9]
+rec[0]["f64"]: [0 1 2 3 4 5 6 7 8 9]
+rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"]
+`,
+ },
+ {
+ name: "chunk=11",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(11),
+ },
+ records: 1,
+ want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9]
+rec[0]["f64"]: [0 1 2 3 4 5 6 7 8 9]
+rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"]
+`,
+ },
+ {
+ name: "chunk=-1",
+ opts: []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(-1),
+ },
+ records: 1,
+ want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9]
+rec[0]["f64"]: [0 1 2 3 4 5 6 7 8 9]
+rec[0]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"]
+`,
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ r := csv.NewReader(bytes.NewReader(raw), schema, tc.opts...)
+
+ defer r.Release()
+
+ r.Retain()
+ r.Release()
+
+ if got, want := r.Schema(), schema; !got.Equal(want) {
+ t.Fatalf("invalid schema: got=%v, want=%v", got, want)
+ }
+
+ out := new(bytes.Buffer)
+
+ n := 0
+ for r.Next() {
+ rec := r.Record()
+ for i, col := range rec.Columns() {
+ fmt.Fprintf(out, "rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
+ }
+ n++
+ }
+
+ if got, want := n, tc.records; got != want {
+ t.Fatalf("invalid number of records: got=%d, want=%d", got, want)
+ }
+
+ if got, want := out.String(), tc.want; got != want {
+ t.Fatalf("invalid output:\ngot:\n%s\nwant:\n%s\n", got, want)
+ }
+
+ if r.Err() != nil {
+ t.Fatalf("unexpected error: %v", r.Err())
+ }
+ })
+ }
+}
+
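+// BenchmarkRead measures reading generated CSV data across combinations of row count, column count, and chunk size.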
+func BenchmarkRead(b *testing.B) {
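+ // gen emits rows lines, each holding cols groups of int64, float64 and string fields separated by ';'.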
+ gen := func(rows, cols int) []byte {
+ buf := new(bytes.Buffer)
+ for i := 0; i < rows; i++ {
+ for j := 0; j < cols; j++ {
+ if j > 0 {
+ fmt.Fprintf(buf, ";")
+ }
+ fmt.Fprintf(buf, "%d;%f;str-%d", i, float64(i), i)
+ }
+ fmt.Fprintf(buf, "\n")
+ }
+ return buf.Bytes()
+ }
+
+ for _, rows := range []int{10, 1e2, 1e3, 1e4, 1e5} {
+ for _, cols := range []int{1, 10, 100, 1000} {
+ raw := gen(rows, cols)
+ for _, chunks := range []int{-1, 0, 10, 100, 1000} {
+ b.Run(fmt.Sprintf("rows=%d cols=%d chunks=%d", rows, cols, chunks), func(b *testing.B) {
+ benchRead(b, raw, rows, cols, chunks)
+ })
+ }
+ }
+ }
+}
+
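+// benchRead builds a schema of (i64, f64, str) triples per column group and reads the generated data b.N times.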
+func benchRead(b *testing.B, raw []byte, rows, cols, chunks int) {
+ mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer mem.AssertSize(b, 0)
+
+ var fields []arrow.Field
+ for i := 0; i < cols; i++ {
+ fields = append(fields, []arrow.Field{
+ {Name: fmt.Sprintf("i64-%d", i), Type: arrow.PrimitiveTypes.Int64},
+ {Name: fmt.Sprintf("f64-%d", i), Type: arrow.PrimitiveTypes.Float64},
+ {Name: fmt.Sprintf("str-%d", i), Type: arrow.BinaryTypes.String},
+ }...)
+ }
+
+ schema := arrow.NewSchema(fields, nil)
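+ // derive the chunk size from the requested number of records; chunk==0 keeps the default of one row per record, and a negative chunk reads everything into a single record.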
+ chunk := 0
+ if chunks != 0 {
+ chunk = rows / chunks
+ }
+ opts := []csv.Option{
+ csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'),
+ csv.WithChunk(chunk),
+ }
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ r := csv.NewReader(bytes.NewReader(raw), schema, opts...)
+
+ n := int64(0)
+ for r.Next() {
+ n += r.Record().NumRows()
+ }
+
+ r.Release()
+ if n != int64(rows) {
+ b.Fatalf("invalid number of rows. want=%d, got=%d", n, rows)
+ }
+ }
+}