secondo/Algebras/Hadoop/clusterManagement/ParallelSecondoConfig.ini

# This file is part of SECONDO.

# Copyright (C) 2004, University in Hagen, Department of Computer Science,
# Database Systems for New Applications.

# SECONDO is free software;  you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation;  either version 2 of the License, or
# (at your option) any later version.

# SECONDO is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY;  without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with SECONDO;  if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# --- Parallel Secondo Configuration ---
# This file is specially prepared for Parallel Secondo Configuration
#
# Note: every parameter explained below can be overruled by an environment
# variable with name SECONDO_PARAM_<keyname>,
# e.g.: export SECONDO_PARAM_SecondoHome="/tmp/databases"

# A parameter written with an equal sign (=) is set to a single value.
# A parameter written with a += sign is set to a sequence of values.

[Secondo]
# Parallel Secondo does not provide any settings in this section yet.
# Set here the Secondo configuration parameters shared by all data servers.
# Every value listed here is inserted into the SecondoConfig.ini file of each
# data server and is distributed later by the ps-secondo-buildMini script.
# Each parameter is written as: [Environment]:[title] = [value]
# e.g.: if BerkeleyDB:CacheSize = 65536 is set here, then the Secondo databases
# of all data servers are given a cache of 64 MB.
# Nonetheless, the parameters that concern Parallel Secondo itself, including
# SecondoFilePath, Environment:SecondoPort, Environment:SecondoHome etc.,
# must not be set here; otherwise data servers that share the same cluster
# node may interfere with each other.


[Hadoop]
# This section sets the configuration of Hadoop,
# on top of which Parallel Secondo is built.
# Each option is written as: [fileName]:[title] = [value]
# At present, four files in $HADOOP_HOME/conf/ are recognized and can be edited:
# hadoop-env.sh, core-site.xml, hdfs-site.xml and mapred-site.xml
# On every cluster node, $HADOOP_HOME and the HDFS path are always placed in the
# node's first data server path, $PARALLEL_SECONDO_MAINDS.
# e.g.: on Mac OS X 10.5, JAVA_HOME should be set as
# hadoop-env.sh:JAVA_HOME = /System/Library/Frameworks/JavaVM.framework/Versions/1.6.0/Home

hadoop-env.sh:JAVA_HOME = /usr/lib/jvm/java-6-openjdk

# The following parameters are specially prepared for Parallel Secondo.
# Only advanced users should need to change them.
# The -D options below pass the properties hostname, home, user and mpath
# to Hadoop; the ${hostname}, ${user} and ${mpath} references in the values
# further down presumably expand to these properties -- TODO confirm.
hadoop-env.sh:HADOOP_OPTS += -Dhostname=$HOSTNAME
hadoop-env.sh:HADOOP_OPTS += -Dhome=$HOME
hadoop-env.sh:HADOOP_OPTS += -Duser=$USER
hadoop-env.sh:HADOOP_OPTS += -Dmpath=$PARALLEL_SECONDO_MAINDS
hadoop-env.sh:HADOOP_OPTS += -Djava.net.preferIPv4Stack=true
hadoop-env.sh:HADOOP_OPTS += -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl
core-site.xml:hadoop.tmp.dir = /tmp/hadoop-${hostname}-${user}
hdfs-site.xml:dfs.name.dir = ${mpath}/HDFS/NAME
hdfs-site.xml:dfs.data.dir = ${mpath}/HDFS/DATA

# By default, the following parameters are prepared for
# deploying Parallel Secondo on a single computer running Ubuntu.
# To deploy it on a computer cluster, replace the host name localhost
# with the IP address of the node chosen as the master node of Parallel Secondo.
# If the cluster is shared by several users,
# set different service port numbers for each of them.
core-site.xml:fs.default.name = hdfs://localhost:49000
hdfs-site.xml:dfs.http.address = localhost:50070
hdfs-site.xml:dfs.secondary.http.address = localhost:50090
mapred-site.xml:mapred.job.tracker = localhost:49001
mapred-site.xml:mapred.job.tracker.http.address = localhost:50030

# Sets how many map/reduce tasks can run in parallel on one node of the cluster.
# Normally this is twice the number of processor cores;
# it is set to 12 here since every computer in our cluster has six processor cores.
# NOTE(review): Hadoop documents mapred.task.timeout in milliseconds, so
# 6000000 would be 100 minutes -- confirm against the Hadoop version in use.
mapred-site.xml:mapred.tasktracker.map.tasks.maximum = 12
mapred-site.xml:mapred.tasktracker.reduce.tasks.maximum = 12
mapred-site.xml:mapred.reduce.parallel.copies = 12
mapred-site.xml:mapred.task.timeout = 6000000
# hdfs-site.xml:dfs.replication = 1


# The following parameters are needed only when several users share a cluster.
# In that case, each user must set different service port numbers.
# hdfs-site.xml:dfs.datanode.address = 0.0.0.0:50010
# hdfs-site.xml:dfs.datanode.http.address = 0.0.0.0:50075
# hdfs-site.xml:dfs.datanode.ipc.address = 0.0.0.0:50020
# hdfs-site.xml:dfs.datanode.https.address = 0.0.0.0:50475
# hdfs-site.xml:dfs.https.address = 0.0.0.0:50470

# mapred-site.xml:mapred.task.tracker.http.address = 0.0.0.0:51060


[Cluster]
# Each node is defined by three parameters, delimited by colons:
# IP address, data server path and Secondo port.
# The IP address cannot be replaced by a host name or the loopback address,
# and the data server path must be an absolute path.
# Ports and data server folders must be different from each other.
# e.g.: Master = 192.168.0.1:/Home/DataServer:1234
# Only one node can be set as the master data server in Parallel Secondo;
# if Master is set several times, only the latest setting is used.
# Slave nodes are set with the '+=' sign, which can be used several times.
# In principle, slaves should be different from each other.
# The master can also be set as a slave.

# Master = 192.168.0.1:/Home/dataServer1:11234
# Slaves += 192.168.0.1:/Home/dataServer2:12234
# Slaves += 192.168.0.1:/Home/dataServer3:13234

[Options]
# This section defines some special options for Parallel Secondo.
# Each option is written as: [title] = [value]

# NS4Master is the abbreviation of "Normal Secondo for Master Mini Secondo".
# If this option is set to true, the user's normal Secondo database is
# used as the master in Parallel Secondo.
# NS4Master = true