1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
#!/bin/bash
## this script resides in the [galsl/fedora_38:tpcds_v2] docker container.
## the container uses the following repo [ https://github.com/galsalomon66/tpc-ds-datagen-to-aws-s3 ] for the dsdgen application.
## the purpose of this script is to launch multiple instances of the dsdgen application (depending on the number of cores).
## the flow splits between the very-big tables and the small tables.
## num_of_cpu defines the degree of parallelism; num_of_partitions defines the number of chunks that together form a single table (it could be huge).
## each cycle of parallel generate-application runs ends with a flow that uploads the generated files into S3 (done in parallel); once all files are uploaded
## they are all removed, i.e. for a 3TB scale there is no need for 3TB of disk space (S3 storage capacity is obviously still needed).
## TODO: set by the user
## output directory for the generated .dat files (must exist before running)
TPCDS_DIR=/tpcds_output/
# NOTE(review): all_tables appears unused in this chunk of the script — verify before removing
all_tables="call_center
catalog_page
customer_address
customer
customer_demographics
date_dim
household_demographics
income_band
item
promotion
reason
ship_mode
store
time_dim
warehouse
web_page
web_site
catalog_returns
catalog_sales
web_returns
web_sales
store_returns
store_sales"
#big tables and also parent
#parent table means it got a child table, i.e. there is a relation between them.
parent_tables="store_sales catalog_sales web_sales inventory"
## not a parent table
standalone_tables="call_center catalog_page customer_address customer customer_demographics date_dim household_demographics income_band
item promotion reason ship_mode store time_dim warehouse web_page web_site"
#small_tables=""
## number of concurrent background jobs (dsdgen processes / uploads) per batch
num_of_cpu=56
## recomputed per table group by the functions below (depends on SCALE)
num_of_partitions=0
#######################################
# Append dsdgen commands for the standalone (non-parent) tables to
# generate_upload_and_remove_exec.bash. Background jobs are batched: after
# every ${num_of_cpu} commands a 'wait' barrier plus an upload-and-remove
# pass is emitted, so disk usage stays bounded.
# Globals read:    TPCDS_DIR, SCALE, num_of_cpu, standalone_tables
# Globals written: num_of_partitions
# Outputs:         appends to ./generate_upload_and_remove_exec.bash
# Exits:           1 if TPCDS_DIR does not exist
#######################################
create_dsdgen_workers_non_parent_tables()
{
  # fail fast with a non-zero status (original used bare 'exit' == status 0)
  [ ! -d "${TPCDS_DIR}" ] && echo "${TPCDS_DIR} does not exist" >&2 && exit 1
  # small tables need fewer chunks: scale (GB) / 1000 * cores
  num_of_partitions=$(awk -v sc="${SCALE}" -v c="${num_of_cpu}" 'BEGIN{print int((sc/1000)*c);}')
  if [ "${num_of_partitions}" -le 1 ]
  then
    # dsdgen's -parallel option needs at least 2 partitions
    num_of_partitions=2
  fi
  echo "small tables=num_of_partitions=${num_of_partitions}"
  i=1
  for t in ${standalone_tables}    # word-splitting of the list is intentional
  do
    for (( c=1; c<=num_of_partitions; c++ ))
    do
      ## the command line defines which table, what scale(size), partition count, which partition to produce and where to produce it.
      echo "time ./dsdgen -dir ${TPCDS_DIR} -table ${t} -scale ${SCALE} -force -parallel ${num_of_partitions} -child ${c} &" >> generate_upload_and_remove_exec.bash
      ## emit a barrier once a full batch of ${num_of_cpu} jobs has been queued
      if [ $(( i++ % num_of_cpu )) -eq 0 ]
      then
        echo wait >> generate_upload_and_remove_exec.bash
        # upon completing the wait, upload each generated .dat file in parallel
        # and remove it after a successful upload
        echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
      fi
    done
  done
  # final barrier + upload pass for the last (possibly partial) batch
  echo wait >> generate_upload_and_remove_exec.bash
  echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
  echo "echo small tables done." >> generate_upload_and_remove_exec.bash
  chmod +x generate_upload_and_remove_exec.bash
}
#######################################
# Create generate_upload_and_remove_exec.bash from scratch: header, then
# dsdgen commands for the big "parent" tables (batched with 'wait' barriers
# and upload-and-remove passes every ${num_of_cpu} jobs), then the
# standalone tables via create_dsdgen_workers_non_parent_tables.
# Globals read:    TPCDS_DIR, SCALE, num_of_cpu, parent_tables
# Globals written: num_of_partitions
# Outputs:         creates ./generate_upload_and_remove_exec.bash (executable)
# Exits:           1 if TPCDS_DIR does not exist
#######################################
create_dsdgen_workers()
{
  # fail fast with a non-zero status (original used bare 'exit' == status 0)
  [ ! -d "${TPCDS_DIR}" ] && echo "${TPCDS_DIR} does not exist" >&2 && exit 1
  # big tables need many chunks: scale (GB) / 10 * cores
  num_of_partitions=$(awk -v sc="${SCALE}" -v c="${num_of_cpu}" 'BEGIN{print int((sc/10)*c);}')
  echo "big tables=num_of_partitions=${num_of_partitions}"
  if [ "${num_of_partitions}" -le 1 ]
  then
    # dsdgen's -parallel option needs at least 2 partitions
    num_of_partitions=2
  fi
  i=1
  # start from a clean file (the original's 'touch' before 'rm -f' was redundant)
  rm -f generate_upload_and_remove_exec.bash
  echo "#!/bin/bash" >> generate_upload_and_remove_exec.bash
  ## the sourced infra file provides upload_and_remove_worker_func for the generated script
  echo ". generate_upload_and_remove_infra.bash" >> generate_upload_and_remove_exec.bash
  echo "cd /tpc-ds-datagen-to-aws-s3/tpc-ds/v2.11.0rc2/tools" >> generate_upload_and_remove_exec.bash
  for t in ${parent_tables}    # word-splitting of the list is intentional
  do
    for (( c=1; c<=num_of_partitions; c++ ))
    do
      ## which table, what scale(size), partition count, which partition, and output directory
      echo "time ./dsdgen -dir ${TPCDS_DIR} -table ${t} -scale ${SCALE} -force -parallel ${num_of_partitions} -child ${c} &" >> generate_upload_and_remove_exec.bash
      ## emit a barrier once a full batch of ${num_of_cpu} jobs has been queued
      if [ $(( i++ % num_of_cpu )) -eq 0 ]
      then
        echo wait >> generate_upload_and_remove_exec.bash
        # upon completing the wait, upload each generated .dat file in parallel
        # and remove it after a successful upload
        echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
      fi
    done
  done
  # final barrier + upload pass for the last (possibly partial) batch
  echo wait >> generate_upload_and_remove_exec.bash
  echo upload_and_remove_worker_func >> generate_upload_and_remove_exec.bash
  echo "echo big tables done." >> generate_upload_and_remove_exec.bash
  ## adding the production of the other (standalone) tables
  create_dsdgen_workers_non_parent_tables
  chmod +x generate_upload_and_remove_exec.bash
  ## the generated script below contains all that is needed for creating TPCDS tables in S3 storage.
  ## it should be executed by the user:
  #./generate_upload_and_remove_exec.bash
}
#######################################
# Build and run upload_and_remove_exec.bash: one backgrounded
# 'aws s3api put-object' per generated .dat file, removing each file after
# a successful upload, with a 'wait' barrier after every ${num_of_cpu}
# uploads so at most that many transfers run concurrently.
# Globals read: TPCDS_DIR, SCALE, S3_ENDPOINT, num_of_cpu
# Outputs:      creates and executes ./upload_and_remove_exec.bash;
#               keeps a debug copy upload_and_remove.bash_${RANDOM}
#######################################
upload_and_remove_worker_func()
{
  # BUG FIX: the original '(i=0)' assigned i inside a subshell (a no-op),
  # leaving i unset so the first file immediately triggered a 'wait'.
  # Start at 1 for a full first batch, consistent with the other functions.
  i=1
  # start from a clean file (the original's 'touch' before 'rm -f' was redundant)
  rm -f upload_and_remove_exec.bash
  echo "#!/bin/bash" >> upload_and_remove_exec.bash
  # glob instead of parsing 'ls' output; also silent when no .dat files exist
  for f in "${TPCDS_DIR}"/*.dat
  do
    [ -e "$f" ] || continue   # unmatched glob leaves the literal pattern
    # strip the "_<child>_<partitions>" suffix to recover the table name
    table_name=$(basename "$f" | sed 's/_[0-9]\+_[0-9]\+/ /' | awk '{print $1;}')
    echo "(aws s3api put-object --bucket hive --key scale_${SCALE}/${table_name}/$(basename "$f") --body ${f} --endpoint-url ${S3_ENDPOINT} > /dev/null 2>&1 && echo upload ${f} && rm -f ${f}) &" >> upload_and_remove_exec.bash
    # emit a barrier once a full batch of ${num_of_cpu} uploads has been queued
    if [ $(( i++ % num_of_cpu )) -eq 0 ]
    then
      echo wait >> upload_and_remove_exec.bash
    fi
  done
  # final barrier for the last (possibly partial) batch
  echo wait >> upload_and_remove_exec.bash
  #upload and remove all generated files
  chmod +x upload_and_remove_exec.bash
  cp upload_and_remove_exec.bash "upload_and_remove.bash_${RANDOM}" ## debug
  ## start upload and remove in parallel
  ./upload_and_remove_exec.bash
}
|