1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
#!/bin/bash
## this script resides in the [galsl/fedora_38:tpcds_v2] docker container.
## the container uses the following repo [ https://github.com/galsalomon66/tpc-ds-datagen-to-aws-s3 ] for the dsdgen application.
## the purpose of this script is to launch multiple instances of the dsdgen application (depending on the number of cores).
## the flow splits between the very-big tables and the small tables.
## num_of_cpu defines the degree of parallelism; num_of_partitions defines the number of chunks that together form a single table (it could be huge).
## each cycle of parallel generate-application runs ends with a flow that uploads the generated files into S3 (done in parallel); once all files are uploaded
## they are all removed, i.e. for a 3TB scale there is no need for 3TB of disk space (S3 storage capacity is obviously still needed).
## TODO: set by the user
## output directory for the generated .dat files (must exist before running)
TPCDS_DIR=/tpcds_output/
# NOTE(review): all_tables appears unused in this chunk of the script — verify before removing
all_tables="call_center
catalog_page
customer_address
customer
customer_demographics
date_dim
household_demographics
income_band
item
promotion
reason
ship_mode
store
time_dim
warehouse
web_page
web_site
catalog_returns
catalog_sales
web_returns
web_sales
store_returns
store_sales"
#big tables and also parent
#parent table means it got a child table, i.e. there is a relation between them.
parent_tables="store_sales catalog_sales web_sales inventory"
## not a parent table
standalone_tables="call_center catalog_page customer_address customer customer_demographics date_dim household_demographics income_band
item promotion reason ship_mode store time_dim warehouse web_page web_site"
#small_tables=""
## number of concurrent background jobs (dsdgen processes / uploads) per batch
num_of_cpu=56
## recomputed per table group by the functions below (depends on SCALE)
num_of_partitions=0
#######################################
# Append dsdgen commands for the standalone (non-parent) tables to
# generate_upload_and_remove_exec.bash. Background jobs are batched: after
# every ${num_of_cpu} commands a 'wait' barrier plus an upload-and-remove
# pass is emitted, so disk usage stays bounded.
# Globals read:    TPCDS_DIR, SCALE, num_of_cpu, standalone_tables
# Globals written: num_of_partitions
# Outputs:         appends to ./generate_upload_and_remove_exec.bash
# Exits:           1 if TPCDS_DIR does not exist
#######################################
create_dsdgen_workers_non_parent_tables()
{
  # fail fast with a non-zero status (original used bare 'exit' == status 0)
  [ ! -d "${TPCDS_DIR}" ] && echo "${TPCDS_DIR} does not exist" >&2 && exit 1
  # small tables need fewer chunks: scale (GB) / 1000 * cores
  num_of_partitions=$(awk -v sc="${SCALE}" -v c="${num_of_cpu}" 'BEGIN{print int((sc/1000)*c);}')
  if [ "${num_of_partitions}" -le 1 ]
  then
    # dsdgen's -parallel option needs at least 2 partitions
    num_of_partitions=2
  fi
  echo "small tables=num_of_partitions=${num_of_partitions}"
  i=1
  for t in ${standalone_tables}    # word-splitting of the list is intentional
  do
    for (( c=1; c<=num_of_partitions; c++ ))
    do
      ## the command line defines which table, what scale(size), partition count, which partition to produce and where to produce it.
      echo "time ./dsdgen -dir ${TPCDS_DIR} -table ${t} -scale ${SCALE} -force -parallel ${num_of_partitions} -child ${c} &" >> generate_upload_and_remove_exec.bash
      ## emit a barrier once a full batch of ${num_of_cpu} jobs has been queued
      if [ $(( i++ % num_of_cpu )) -eq 0 ]
      then
        echo wait >> generate_upload_and_remove_exec.bash
        # upon completing the wait, upload each generated .dat file in parallel
        # and remove it after a successful upload
        echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
      fi
    done
  done
  # final barrier + upload pass for the last (possibly partial) batch
  echo wait >> generate_upload_and_remove_exec.bash
  echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
  echo "echo small tables done." >> generate_upload_and_remove_exec.bash
  chmod +x generate_upload_and_remove_exec.bash
}
#######################################
# Create generate_upload_and_remove_exec.bash from scratch: header, then
# dsdgen commands for the big "parent" tables (batched with 'wait' barriers
# and upload-and-remove passes every ${num_of_cpu} jobs), then the
# standalone tables via create_dsdgen_workers_non_parent_tables.
# Globals read:    TPCDS_DIR, SCALE, num_of_cpu, parent_tables
# Globals written: num_of_partitions
# Outputs:         creates ./generate_upload_and_remove_exec.bash (executable)
# Exits:           1 if TPCDS_DIR does not exist
#######################################
create_dsdgen_workers()
{
  # fail fast with a non-zero status (original used bare 'exit' == status 0)
  [ ! -d "${TPCDS_DIR}" ] && echo "${TPCDS_DIR} does not exist" >&2 && exit 1
  # big tables need many chunks: scale (GB) / 10 * cores
  num_of_partitions=$(awk -v sc="${SCALE}" -v c="${num_of_cpu}" 'BEGIN{print int((sc/10)*c);}')
  echo "big tables=num_of_partitions=${num_of_partitions}"
  if [ "${num_of_partitions}" -le 1 ]
  then
    # dsdgen's -parallel option needs at least 2 partitions
    num_of_partitions=2
  fi
  i=1
  # start from a clean file (the original's 'touch' before 'rm -f' was redundant)
  rm -f generate_upload_and_remove_exec.bash
  echo "#!/bin/bash" >> generate_upload_and_remove_exec.bash
  ## the sourced infra file provides upload_and_remove_worker_func for the generated script
  echo ". generate_upload_and_remove_infra.bash" >> generate_upload_and_remove_exec.bash
  echo "cd /tpc-ds-datagen-to-aws-s3/tpc-ds/v2.11.0rc2/tools" >> generate_upload_and_remove_exec.bash
  for t in ${parent_tables}    # word-splitting of the list is intentional
  do
    for (( c=1; c<=num_of_partitions; c++ ))
    do
      ## which table, what scale(size), partition count, which partition, and output directory
      echo "time ./dsdgen -dir ${TPCDS_DIR} -table ${t} -scale ${SCALE} -force -parallel ${num_of_partitions} -child ${c} &" >> generate_upload_and_remove_exec.bash
      ## emit a barrier once a full batch of ${num_of_cpu} jobs has been queued
      if [ $(( i++ % num_of_cpu )) -eq 0 ]
      then
        echo wait >> generate_upload_and_remove_exec.bash
        # upon completing the wait, upload each generated .dat file in parallel
        # and remove it after a successful upload
        echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
      fi
    done
  done
  # final barrier + upload pass for the last (possibly partial) batch
  echo wait >> generate_upload_and_remove_exec.bash
  echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
  echo "echo big tables done." >> generate_upload_and_remove_exec.bash
  ## adding the production of the other (standalone) tables
  create_dsdgen_workers_non_parent_tables
  chmod +x generate_upload_and_remove_exec.bash
  ## the generated script below contains all that is needed for creating TPCDS tables in S3 storage.
  ## it should be executed by the user:
  #./generate_upload_and_remove_exec.bash
}
#######################################
# Build and run upload_and_remove_exec.bash: one backgrounded
# 'aws s3api put-object' per generated .dat file, removing each file after
# a successful upload, with a 'wait' barrier after every ${num_of_cpu}
# uploads so at most that many transfers run concurrently.
# Globals read: TPCDS_DIR, SCALE, S3_ENDPOINT, num_of_cpu
# Outputs:      creates and executes ./upload_and_remove_exec.bash;
#               keeps a debug copy upload_and_remove.bash_${RANDOM}
#######################################
upload_and_remove_worker_func()
{
  # BUG FIX: the original '(i=0)' assigned i inside a subshell (a no-op),
  # leaving i unset so the first file immediately triggered a 'wait'.
  # Start at 1 for a full first batch, consistent with the other functions.
  i=1
  # start from a clean file (the original's 'touch' before 'rm -f' was redundant)
  rm -f upload_and_remove_exec.bash
  echo "#!/bin/bash" >> upload_and_remove_exec.bash
  # glob instead of parsing 'ls' output; also silent when no .dat files exist
  for f in "${TPCDS_DIR}"/*.dat
  do
    [ -e "$f" ] || continue   # unmatched glob leaves the literal pattern
    # strip the "_<child>_<partitions>" suffix to recover the table name
    table_name=$(basename "$f" | sed 's/_[0-9]\+_[0-9]\+/ /' | awk '{print $1;}')
    echo "(aws s3api put-object --bucket hive --key scale_${SCALE}/${table_name}/$(basename "$f") --body ${f} --endpoint-url ${S3_ENDPOINT} > /dev/null 2>&1 && echo upload ${f} && rm -f ${f}) &" >> upload_and_remove_exec.bash
    # emit a barrier once a full batch of ${num_of_cpu} uploads has been queued
    if [ $(( i++ % num_of_cpu )) -eq 0 ]
    then
      echo wait >> upload_and_remove_exec.bash
    fi
  done
  # final barrier for the last (possibly partial) batch
  echo wait >> upload_and_remove_exec.bash
  #upload and remove all generated files
  chmod +x upload_and_remove_exec.bash
  cp upload_and_remove_exec.bash "upload_and_remove.bash_${RANDOM}" ## debug
  ## start upload and remove in parallel
  ./upload_and_remove_exec.bash
}
|