Files
secondo/Algebras/Hadoop/Parallel_BerlinMOD/generator/genParaBerlinMOD.sh

199 lines
5.6 KiB
Bash
Raw Normal View History

2026-01-23 17:03:45 +08:00
#!/bin/bash
# This script generates a BerlinMOD data set in parallel on a cluster.
# It can only run after Parallel Secondo has been correctly installed.
# Besides, it needs following prerequisites:
# * Distribute the data files streets, homeRegions and workRegions to the cluster.
# * A Hadoop program named GenMOD.jar
# * A set of Secondo scripts, include:
# - BerlinMOD_DataGenerator_map.SEC (Generate data on slaves in Map stage)
# - BerlinMOD_DataGenerator_reduce.SEC (Generate data on slaves in Reduce stage)
# - BerlinMOD_DataGenerator_master1.SEC (Set global parameters on the master database)
# - BerlinMOD_DataGenerator_master2.SEC (Collect distributed data on the master at last)
# * This script must run on the master node of the cluster.
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
WARNINFO="Warning !! "
ERRORINFO="ERROR !! "
DBNAME="berlinmod"
SCALEFACTOR=0.01
P_NUMDAYS=0
GALONE=false
# 0. Process the arguments.
declare -i numOfArgs=$#
let numOfArgs++
while [ $numOfArgs -ne $OPTIND ]; do
getopts "hd:s:p:l" optKey
if [ "$optKey" == "?" ]; then
optKey="h"
fi
case $optKey in
h)
echo -en "Usage of ${0##*/}:\n\n"
echo -en " -h Print this message and exit. \n\n"
echo -en " -d Set the created Database name (default) ${DBNAME} \n\n"
echo -en " -s Set the Scale factor of the data set (default) ${SCALEFACTOR} \n\n"
echo -en " -p Set the Period of simulation by days (default) NULL \n\n"
echo -en " -l Generate the data set Lonely on one computer (default) ${GALONE} \n\n "
exit 0
;;
d)
DBNAME="${OPTARG}"
if [ -z "${DBNAME}" ]; then
echo -en "${ERRORINFO}The database name cannot be empty. \n\n"
exit -1
fi
;;
s)
SCALEFACTOR=${OPTARG}
CHK=$(echo ${SCALEFACTOR/./} | grep "^[0-9]*$")
if [ -z "$CHK" ]; then
echo -en "${ERRORINFO}The input scale factor ${SCALEFACTOR} is invalid.\n\n"
exit -1
elif [ "${CHK}" = "${SCALEFACTOR}" ]; then
echo -en "${ERRORINFO}The scale factor ${SCALEFACTOR} should be a real number.\n\n"
exit -1
fi
;;
p)
P_NUMDAYS=${OPTARG}
CHK=$(echo ${P_NUMDAYS/./} | grep "^[0-9]*$")
if [ -z "$CHK" ]; then
echo -en "${ERRORINFO}The simulate period ${P_NUMDAYS} is invalid.\n\n"
exit -1
elif [ "${CHK}" != "${P_NUMDAYS}" ]; then
echo -en "${ERRORINFO}The simulate period ${P_NUMDAYS} should be an integer.\n\n"
exit -1
elif [ ${P_NUMDAYS} -le 0 ]; then
echo -en "${ERRORINFO}The simulate period ${P_NUMDAYS} should at least be one day.\n\n"
exit -1
fi
;;
l)
GALONE=true
;;
esac
done
# Check the installation of Parallel SECONDO
if [ ! -d ${PARALLEL_SECONDO_MAINDS}/bin ]; then
echo -en "${ERRORINFO}The Parallel Secondo is not correctly installed. \n\n"
exit -1
fi
source ${PARALLEL_SECONDO_MAINDS}/bin/ps-functions
SUBDBNAME="ps${DBNAME}"
if [ ${#SUBDBNAME} -gt 16 ]; then
echo -en "${ERRORINFO}The database name ${DBNAME} is too long. \n\n"
exit -1
fi
tcmd=$(which ps-startTTYCS)
if [ $? -ne 0 ]; then
echo -en "${ERRORINFO}The Parallel Secondo is not correctly installed. \n\n"
exit -1
fi
if [ ! -f "${bin}/BerlinMOD_DataGenerator_master1.SEC" ] \
|| [ ! -f "${bin}/BerlinMOD_DataGenerator_master2.SEC" ] \
|| [ ! -f "${bin}/BerlinMOD_DataGenerator_map.SEC" ] \
|| [ ! -f "${bin}/BerlinMOD_DataGenerator_reduce.SEC" ] \
|| [ ! -f "${bin}/GenMOD.jar" ]; then
echo -en "${ERRORINFO}Cannot find SECONDO scripts and the hadoop program. \n\n"
exit -1
fi
CONFFILE=${PARALLEL_SECONDO_MAINDS}/${PARALLEL_SECONDO_MINI_NAME}/bin/SecondoConfig.ini
if [ ! -f $CONFFILE ]; then
echo -en "${ERRORINFO}The configuration of the Parallel Secondo is wrong. \n\n"
exit -1
fi
LFNAME="/tmp/SECMON_$(get_secPort $CONFFILE)"
if ${GALONE} ; then
if [ -f ${LFNAME} ]; then
echo -en "${ERRORINFO}Stop the running monitors first if you like generate data on a single computer. \n\n"
exit -1
fi
ps-startTTY -s 1 <<< "delete database ${DBNAME};
create database ${DBNAME};
open database ${DBNAME};
let SCALEFACTOR = ${SCALEFACTOR};
close database;
q;"
if [ ${P_NUMDAYS} -gt 0 ]; then
ps-startTTY -s 1 <<< "open database ${DBNAME};
let P_NUMDAYS = ${P_NUMDAYS};
close database;
q;"
fi
ps-startTTY -s 1 <<< "open database ${DBNAME};
@@${bin}/BerlinMOD_DataGenerator_map.SEC
@@${bin}/BerlinMOD_DataGenerator_reduce.SEC
close database;
q;"
exit 0
fi
if [ ! -f ${LFNAME} ]; then
echo -en "${ERRORINFO}The monitors are not started. \n\n"
exit -1
fi
ps-startTTYCS -s 1 <<< "delete database ${DBNAME};
create database ${DBNAME};
open database ${DBNAME};
let SCALEFACTOR = ${SCALEFACTOR};
close database;
q;"
if [ ${P_NUMDAYS} -gt 0 ]; then
ps-startTTYCS -s 1 <<< "open database ${DBNAME};
let P_NUMDAYS = ${P_NUMDAYS};
close database;
q;"
fi
ps-startTTYCS -s 1 <<< "open database ${DBNAME};
@@${bin}/BerlinMOD_DataGenerator_master1.SEC
close database;
q;"
if [ $? -ne 0 ];then
echo -en "${ERROR}Initialization on the master DS fails.\n\n"
exit -1
fi
hadoop jar ${bin}/GenMOD.jar GenMOD ${SUBDBNAME} ${SCALEFACTOR} ${P_NUMDAYS} ${bin}/BerlinMOD_DataGenerator_map.SEC ${bin}/BerlinMOD_DataGenerator_reduce.SEC
if [ $? -ne 0 ];then
echo -en "${ERROR}The Hadoop program fails.\n\n"
exit -1
fi
RESULTFILE=$bin/.genResult
hadoop dfs -cat OUTPUT-MOD/p* > ${RESULTFILE}
RESULT="("
while read ROW DS COLUMN; do
RESULT="${RESULT}(${ROW} ${DS} ${COLUMN} '') "
done < ${RESULTFILE}
RESULT="${RESULT})"
ps-startTTYCS -s 1 <<< "open database ${DBNAME};
let locRel = [const rel(tuple([Row:int, DS:int, Column:int, Path:text])) value ${RESULT} ];
@@${bin}/BerlinMOD_DataGenerator_master2.SEC
close database;
q;"