/* * Sccsid: @(#)rnd.c 9.1.1.9 9/7/95 16:24:04 * * RANDOM.C -- Implements Park & Miller's "Minimum Standard" RNG * * (Reference: CACM, Oct 1988, pp 1192-1201) * * NextRand: Computes next random integer * UnifInt: Yields an long uniformly distributed between given bounds * UnifReal: ields a real uniformly distributed between given bounds * Exponential: Yields a real exponentially distributed with given mean * */ #include "config.h" #include #include #include "dss.h" #include "rnd.h" char *env_config PROTO((char *tag, char *dflt)); #define PI (3.141597) /****************************************************************** NextRand: Computes next random integer *******************************************************************/ /* * long NextRand( long nSeed ) */ long NextRand(long nSeed) /* * nSeed is the previous random number; the returned value is the * next random number. The routine generates all numbers in the * range 1 .. nM-1. */ { /* * The routine returns (nSeed * nA) mod nM, where nA (the * multiplier) is 16807, and nM (the modulus) is * 2147483647 = 2^31 - 1. * * nM is prime and nA is a primitive element of the range 1..nM-1. * This * means that the map nSeed = (nSeed*nA) mod nM, starting * from any nSeed in 1..nM-1, runs through all elements of 1..nM-1 * before repeating. It never hits 0 or nM. * * To compute (nSeed * nA) mod nM without overflow, use the * following trick. Write nM as nQ * nA + nR, where nQ = nM / nA * and nR = nM % nA. (For nM = 2147483647 and nA = 16807, * get nQ = 127773 and nR = 2836.) Write nSeed as nU * nQ + nV, * where nU = nSeed / nQ and nV = nSeed % nQ. Then we have: * * nM = nA * nQ + nR nQ = nM / nA nR < nA < nQ * * nSeed = nU * nQ + nV nU = nSeed / nQ nV < nU * * Since nA < nQ, we have nA*nQ < nM < nA*nQ + nA < nA*nQ + nQ, * i.e., nM/nQ = nA. This gives bounds on nU and nV as well: * nM > nSeed => nM/nQ * >= nSeed/nQ => nA >= nU ( > nV ). * * Using ~ to mean "congruent mod nM" this gives: * * nA * nSeed ~ nA * (nU*nQ + nV) * * ~ nA*nU*nQ + nA*nV * * ~ nU * (-nR) + nA*nV (as nA*nQ ~ -nR) * * Both products in the last sum can be computed without overflow * (i.e., both have absolute value < nM) since nU*nR < nA*nQ < nM, * and nA*nV < nA*nQ < nM. Since the two products have opposite * sign, their sum lies between -(nM-1) and +(nM-1). If * non-negative, it is the answer (i.e., it's congruent to * nA*nSeed and lies between 0 and nM-1). Otherwise adding nM * yields a number still congruent to nA*nSeed, but now between * 0 and nM-1, so that's the answer. */ long nU, nV; nU = nSeed / nQ; nV = nSeed - nQ * nU; /* i.e., nV = nSeed % nQ */ nSeed = nA * nV - nU * nR; if (nSeed < 0) nSeed += nM; return (nSeed); } /****************************************************************** UnifInt: Yields an long uniformly distributed between given bounds *******************************************************************/ /* * long UnifInt( long nLow, long nHigh, long nStream ) */ long UnifInt(long nLow, long nHigh, long nStream) /* * Returns an integer uniformly distributed between nLow and nHigh, * including * the endpoints. nStream is the random number stream. * Stream 0 is used if nStream is not in the range 0..MAX_STREAM. */ { double dRange; long nTemp; if (nStream < 0 || nStream > MAX_STREAM) nStream = 0; if (nLow == nHigh) return (nLow); if (nLow > nHigh) { nTemp = nLow; nLow = nHigh; nHigh = nTemp; } dRange = (double) (nHigh - nLow + 1); Seed[nStream] = NextRand(Seed[nStream]); nTemp = (long) (((double) Seed[nStream] / dM) * (dRange)); return (nLow + nTemp); } /****************************************************************** UnifReal: Yields a real uniformly distributed between given bounds *******************************************************************/ /* * double UnifReal( double dLow, double dHigh, long nStream ) */ double UnifReal(double dLow, double dHigh, long nStream) /* * Returns a double uniformly distributed between dLow and dHigh, * excluding the endpoints. nStream is the random number stream. * Stream 0 is used if nStream is not in the range 0..MAX_STREAM. */ { double dTemp; if (nStream < 0 || nStream > MAX_STREAM) nStream = 0; if (dLow == dHigh) return (dLow); if (dLow > dHigh) { dTemp = dLow; dLow = dHigh; dHigh = dTemp; } Seed[nStream] = NextRand(Seed[nStream]); dTemp = ((double) Seed[nStream] / dM) * (dHigh - dLow); return (dLow + dTemp); } /******************************************************************% Exponential: Yields a real exponentially distributed with given mean *******************************************************************/ /* * double Exponential( double dMean, long nStream ) */ double Exponential(double dMean, long nStream) /* * Returns a double uniformly distributed with mean dMean. * 0.0 is returned iff dMean <= 0.0. nStream is the random number * stream. Stream 0 is used if nStream is not in the range * 0..MAX_STREAM. */ { double dTemp; if (nStream < 0 || nStream > MAX_STREAM) nStream = 0; if (dMean <= 0.0) return (0.0); Seed[nStream] = NextRand(Seed[nStream]); dTemp = (double) Seed[nStream] / dM; /* unif between 0..1 */ return (-dMean * log(1.0 - dTemp)); } /* * Extensions to dbgen for generation of skewed data. * Surajit Chaudhuri, Vivek Narasayya. * (Jan '99) */ #define EPSILON (0.0001) /* * Round the number x to the nearest integer. */ double round(double x) { double fx = floor(x); double cx = ceil(x); if((x-fx) <= (cx-x)) return fx; else return cx; } /* * Perform binary search to find D (number of distinct values for given n, zipf) * */ double SolveForMultipler(long n, double zipf) { long lowD = 1; long highD = n; long mid; double sumMult; double nRowsPrime; long numRows; long i; while ( (highD - lowD) > 1) { mid = (highD + lowD) / 2; sumMult = 0.0; /* compute multiplier */ for(i=1;i<=mid;i++) { sumMult += 1.0 / (pow((double) i, zipf)); } /* compute number of rows generated for this mulitpler */ numRows = 0; for(i=1;i<=mid;i++) { nRowsPrime = ((n / sumMult) / pow((double) i, zipf)); numRows += (long) round(nRowsPrime); } if(((double)(n-numRows))/n < EPSILON) { break; } if(numRows > n) { /* we overestimated numRows -- we need fewer distinct values */ highD = mid; } else { /* underestimation of numRows -- need lower multiplier */ lowD = mid; } } return sumMult; } double GetMultiplier(long n, double zipf) { double multiplier; double term; long i; if(zipf == 0.0) multiplier = n; else if(zipf==1.0) multiplier = log(n) + 0.577 ; else if(zipf==2.0) multiplier = (PI*PI)/6.0; else if(zipf==3.0) multiplier = 1.202; else if(zipf==4.0) multiplier = (PI*PI*PI*PI)/90.0; else { /* compute multiplier (must be within given bounds) */ multiplier = 0; for(i=1;i<=n;i++) { term = 1.0 / pow((double)i, zipf); multiplier += term; /* if later terms add very little we can stop */ if(term < EPSILON) break; } } return multiplier; } /******************************************************************% RANDOM: Yields a random number from a Zipfian distribution with a specified skewVal. Skew is an integer in the range 0..3 *******************************************************************/ /* * */ long SkewInt(long nLow, long nHigh, long nStream, double skewVal, long n) /* * */ { double zipf; double dRange; long nTemp; double multiplier; double Czn; long numDistinctValuesGenerated; /* check for validity of skewVal */ if(skewVal < 0 || skewVal > 5) zipf = 0; /* assume uniform */ else if(skewVal==5.0) { /* special case */ /* check if any values have been generated for this column */ if(NumDistinctValuesGenerated[nStream]==0) { /* pick a skew value to be used for this column*/ zipf = (int) UnifInt(0, 4, 0); ColumnSkewValue[nStream] = zipf; } else { /* column skew value has been initialized before */ zipf = ColumnSkewValue[nStream]; } } else { /* skewVal is between 0 and 4 */ zipf = skewVal; } /* If no values have been generated for this stream as yet, get multiplier */ if(NumDistinctValuesGenerated[nStream]==0) { Multiplier[nStream] = GetMultiplier(n, zipf); } multiplier = Multiplier[nStream]; /* * Check how many copies of the current value * have already been generated for this stream. * If we have generated enough, proceed to * next value, and decide how many copies of * this value should be generated. */ if(CurrentValueCounter[nStream] == CurrentValueTarget[nStream]) { /* proceed to next value */ if (nStream < 0 || nStream > MAX_STREAM) nStream = 0; if (nLow == nHigh) nTemp = nLow; else { if (nLow > nHigh) { nTemp = nLow; nLow = nHigh; nHigh = nTemp; } dRange = (double) (nHigh - nLow + 1); Seed[nStream] = NextRand(Seed[nStream]); nTemp = (long) (((double) Seed[nStream] / dM) * (dRange)); nTemp += nLow; CurrentValue[nStream] = nTemp; } } else { /* return another copy of current value */ nTemp = CurrentValue[nStream]; CurrentValueCounter[nStream]++; return nTemp; } /* * check how many distinct values for this column * have already been generated. */ numDistinctValuesGenerated = NumDistinctValuesGenerated[nStream] + 1; if(n<1) n = (nHigh - nLow + 1); /* * zipf is guaranteed to be in the range 0..4 * pick the multiplier if(zipf == 0) multiplier = n; else if(zipf==1) multiplier = log(n) + 0.577 ; else if(zipf==2) multiplier = (PI*PI)/6.0; else if(zipf==3) multiplier = 1.202; else if(zipf==4) multiplier = (PI*PI*PI*PI)/90.0; */ Czn = n/multiplier; CurrentValueTarget[nStream]= (long) (Czn/pow((double)numDistinctValuesGenerated, zipf)); /* ensure that there is at least one value*/ CurrentValueTarget[nStream] = (CurrentValueTarget[nStream] < 1) ? 1: CurrentValueTarget[nStream]; CurrentValueCounter[nStream] = 1; NumDistinctValuesGenerated[nStream]++; return nTemp; } /******************************************************************% Returns the number of copies of the next value in the given distribution. Skew is an integer in the range 0..4 *******************************************************************/ /* * */ long SkewIntCount(long nStream, double skewVal, long n) /* * */ { double zipf; double multiplier; double Czn; long numDistinctValuesGenerated; long numCopies; /* check for validity of skewVal */ if(skewVal < 0 || skewVal > 4) zipf = 0; /* assume uniform */ else if(skewVal==5.0) { /* column skew value has been initialized before */ zipf = ColumnSkewValue[nStream]; } else { /* skewVal is between 0 and 4 */ zipf = skewVal; } /* * check how many distinct values for this column * have already been generated. */ numDistinctValuesGenerated = NumDistinctValuesGenerated[nStream] + 1; multiplier = Multiplier[nStream]; /* * zipf is guaranteed to be in the range 0..4 * pick the multiplier if(zipf == 0) multiplier = n; else if(zipf==1) multiplier = log(n) + 0.577 ; else if(zipf==2) multiplier = (PI*PI)/6.0; else if(zipf==3) multiplier = 1.202; else if(zipf==4) multiplier = (PI*PI*PI*PI)/90.0; */ Czn = n/multiplier; numCopies= (long) (Czn/pow((double)numDistinctValuesGenerated, zipf)); /* ensure that there is at least one value*/ CurrentValueTarget[nStream] = (CurrentValueTarget[nStream] < 1) ? 1: CurrentValueTarget[nStream]; NumDistinctValuesGenerated[nStream]++; return numCopies; }