summaryrefslogtreecommitdiffstats
path: root/qa/workunits/hadoop
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /qa/workunits/hadoop
parentInitial commit. (diff)
downloadceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rwxr-xr-xqa/workunits/hadoop/repl.sh42
-rwxr-xr-xqa/workunits/hadoop/terasort.sh76
-rwxr-xr-xqa/workunits/hadoop/wordcount.sh35
3 files changed, 153 insertions, 0 deletions
diff --git a/qa/workunits/hadoop/repl.sh b/qa/workunits/hadoop/repl.sh
new file mode 100755
index 000000000..84f6150ab
--- /dev/null
+++ b/qa/workunits/hadoop/repl.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+# bail if $TESTDIR is not set as this test will fail in that scenario
+[ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not. Exiting."; exit 1; }
+
+# if HADOOP_PREFIX is not set, use default
+[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/hadoop; }
+
+# create pools with different replication factors
+for repl in 2 3 7 8 9; do
+ name=hadoop.$repl
+ ceph osd pool create $name 8 8
+ ceph osd pool set $name size $repl
+
+ id=`ceph osd dump | sed -n "s/^pool \([0-9]*\) '$name'.*/\1/p"`
+ ceph fs add_data_pool cephfs $id
+done
+
+# create a file in each of the pools
+for repl in 2 3 7 8 9; do
+ name=hadoop.$repl
+ $HADOOP_PREFIX/bin/hadoop fs -rm -f /$name.dat
+ dd if=/dev/zero bs=1048576 count=1 | \
+ $HADOOP_PREFIX/bin/hadoop fs -Dceph.data.pools="$name" \
+ -put - /$name.dat
+done
+
+# check that hadoop reports replication matching
+# that of the pool the file was written into
+for repl in 2 3 7 8 9; do
+ name=hadoop.$repl
+ repl2=$($HADOOP_PREFIX/bin/hadoop fs -ls /$name.dat | awk '{print $2}')
+ if [ $repl -ne $repl2 ]; then
+ echo "replication factors didn't match!"
+ exit 1
+ fi
+done
+
+exit 0
diff --git a/qa/workunits/hadoop/terasort.sh b/qa/workunits/hadoop/terasort.sh
new file mode 100755
index 000000000..3d6988a21
--- /dev/null
+++ b/qa/workunits/hadoop/terasort.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+INPUT=/terasort-input
+OUTPUT=/terasort-output
+REPORT=/tersort-report
+
+num_records=100000
+[ ! -z $NUM_RECORDS ] && num_records=$NUM_RECORDS
+
+# bail if $TESTDIR is not set as this test will fail in that scenario
+[ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not. Exiting."; exit 1; }
+
+# if HADOOP_PREFIX is not set, use default
+[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/hadoop; }
+
+# Nuke hadoop directories
+$HADOOP_PREFIX/bin/hadoop fs -rm -r $INPUT $OUTPUT $REPORT || true
+
+# Generate terasort data
+#
+#-Ddfs.blocksize=512M \
+#-Dio.file.buffer.size=131072 \
+#-Dmapreduce.map.java.opts=-Xmx1536m \
+#-Dmapreduce.map.memory.mb=2048 \
+#-Dmapreduce.task.io.sort.mb=256 \
+#-Dyarn.app.mapreduce.am.resource.mb=1024 \
+#-Dmapred.map.tasks=64 \
+$HADOOP_PREFIX/bin/hadoop jar \
+ $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
+ teragen \
+ -Dmapred.map.tasks=9 \
+ $num_records \
+ $INPUT
+
+# Run the sort job
+#
+#-Ddfs.blocksize=512M \
+#-Dio.file.buffer.size=131072 \
+#-Dmapreduce.map.java.opts=-Xmx1536m \
+#-Dmapreduce.map.memory.mb=2048 \
+#-Dmapreduce.map.output.compress=true \
+#-Dmapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.Lz4Codec \
+#-Dmapreduce.reduce.java.opts=-Xmx1536m \
+#-Dmapreduce.reduce.memory.mb=2048 \
+#-Dmapreduce.task.io.sort.factor=100 \
+#-Dmapreduce.task.io.sort.mb=768 \
+#-Dyarn.app.mapreduce.am.resource.mb=1024 \
+#-Dmapred.reduce.tasks=100 \
+#-Dmapreduce.terasort.output.replication=1 \
+$HADOOP_PREFIX/bin/hadoop jar \
+ $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
+ terasort \
+ -Dmapred.reduce.tasks=10 \
+ $INPUT $OUTPUT
+
+# Validate the sorted data
+#
+#-Ddfs.blocksize=512M \
+#-Dio.file.buffer.size=131072 \
+#-Dmapreduce.map.java.opts=-Xmx1536m \
+#-Dmapreduce.map.memory.mb=2048 \
+#-Dmapreduce.reduce.java.opts=-Xmx1536m \
+#-Dmapreduce.reduce.memory.mb=2048 \
+#-Dmapreduce.task.io.sort.mb=256 \
+#-Dyarn.app.mapreduce.am.resource.mb=1024 \
+#-Dmapred.reduce.tasks=1 \
+$HADOOP_PREFIX/bin/hadoop jar \
+ $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
+ teravalidate \
+ -Dmapred.reduce.tasks=1 \
+ $OUTPUT $REPORT
+
+exit 0
diff --git a/qa/workunits/hadoop/wordcount.sh b/qa/workunits/hadoop/wordcount.sh
new file mode 100755
index 000000000..616b08af2
--- /dev/null
+++ b/qa/workunits/hadoop/wordcount.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+WC_INPUT=/wc_input
+WC_OUTPUT=/wc_output
+DATA_INPUT=$(mktemp -d)
+
+echo "starting hadoop-wordcount test"
+
+# bail if $TESTDIR is not set as this test will fail in that scenario
+[ -z $TESTDIR ] && { echo "\$TESTDIR needs to be set, but is not. Exiting."; exit 1; }
+
+# if HADOOP_PREFIX is not set, use default
+[ -z $HADOOP_PREFIX ] && { HADOOP_PREFIX=$TESTDIR/hadoop; }
+
+# Nuke hadoop directories
+$HADOOP_PREFIX/bin/hadoop fs -rm -r $WC_INPUT $WC_OUTPUT || true
+
+# Fetch and import testing data set
+curl http://download.ceph.com/qa/hadoop_input_files.tar | tar xf - -C $DATA_INPUT
+$HADOOP_PREFIX/bin/hadoop fs -copyFromLocal $DATA_INPUT $WC_INPUT
+rm -rf $DATA_INPUT
+
+# Run the job
+$HADOOP_PREFIX/bin/hadoop jar \
+ $HADOOP_PREFIX/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
+ wordcount $WC_INPUT $WC_OUTPUT
+
+# Cleanup
+$HADOOP_PREFIX/bin/hadoop fs -rm -r $WC_INPUT $WC_OUTPUT || true
+
+echo "completed hadoop-wordcount test"
+exit 0