/*
* M_APM - mapmfmul.c
*
* Copyright (C) 1999 - 2007 Michael C. Ring
*
* Permission to use, copy, and distribute this software and its
* documentation for any purpose with or without fee is hereby granted,
* provided that the above copyright notice appear in all copies and
* that both that copyright notice and this permission notice appear
* in supporting documentation.
*
* Permission to modify the software is granted. Permission to distribute
* the modified code is granted. Modifications are to be distributed by
* using the file 'license.txt' as a template to modify the file header.
* 'license.txt' is available in the official MAPM distribution.
*
* This software is provided "as is" without express or implied warranty.
*/
/*
* $Id: mapmfmul.c,v 1.33 2007/12/03 01:52:22 mike Exp $
*
* This file contains the divide-and-conquer FAST MULTIPLICATION
* function as well as its support functions.
*
* $Log: mapmfmul.c,v $
* Revision 1.33 2007/12/03 01:52:22 mike
* Update license
*
* Revision 1.32 2004/02/18 03:16:15 mike
* optimize 4 byte multiply (when FFT is disabled)
*
* Revision 1.31 2003/12/04 01:14:06 mike
* redo math on 'borrow'
*
* Revision 1.30 2003/07/21 20:34:18 mike
* Modify error messages to be in a consistent format.
*
* Revision 1.29 2003/03/31 21:55:07 mike
* call generic error handling function
*
* Revision 1.28 2002/11/03 22:38:15 mike
* Updated function parameters to use the modern style
*
* Revision 1.27 2002/02/14 19:53:32 mike
* add conditional compiler option to disable use
* of FFT multiply if the user so chooses.
*
* Revision 1.26 2001/07/26 20:56:38 mike
* fix comment, no code change
*
* Revision 1.25 2001/07/16 19:43:45 mike
* add function M_free_all_fmul
*
* Revision 1.24 2001/02/11 22:34:47 mike
* modify parameters to REALLOC
*
* Revision 1.23 2000/10/20 19:23:26 mike
* adjust power_of_2 function so it should work with
* 64 bit processors and beyond.
*
* Revision 1.22 2000/08/23 22:27:34 mike
* no real code change, re-named 2 local functions
* so they make more sense.
*
* Revision 1.21 2000/08/01 22:24:38 mike
* use sizeof(int) function call to stop some
* compilers from complaining.
*
* Revision 1.20 2000/07/19 17:12:00 mike
* lower the number of bytes that the FFT can handle. worst case
* testing indicated math overflow when >= 1048576
*
* Revision 1.19 2000/07/08 18:29:03 mike
* increase define so FFT can handle bigger numbers
*
* Revision 1.18 2000/07/06 23:20:12 mike
* changed my mind. use static local MAPM numbers
* for temp data storage
*
* Revision 1.17 2000/07/06 20:52:34 mike
* use init function to get local writable copies
* instead of using the stack
*
* Revision 1.16 2000/07/04 17:25:09 mike
* guarantee 16 bit compilers still work OK
*
* Revision 1.15 2000/07/04 15:40:02 mike
* add call to use FFT algorithm
*
* Revision 1.14 2000/05/05 21:10:46 mike
* add comment indicating availability of assembly language
* version of M_4_byte_multiply for Linux on x86 platforms.
*
* Revision 1.13 2000/04/20 19:30:45 mike
* minor optimization to 4 byte multiply
*
* Revision 1.12 2000/04/14 15:39:30 mike
* optimize the fast multiply function. don't re-curse down
* to a size of 1. recurse down to a size of '4' and then
* call a special 4 byte multiply function.
*
* Revision 1.11 2000/02/03 23:02:13 mike
* put in RCS for real...
*
* Revision 1.10 2000/02/03 22:59:08 mike
* remove the extra recursive function. not needed any
* longer since all current compilers should not have
* any problem with true recursive calls.
*
* Revision 1.9 2000/02/03 22:47:39 mike
* use MAPM_* generic memory function
*
* Revision 1.8 1999/09/19 21:13:44 mike
* eliminate unneeded local int in _split
*
* Revision 1.7 1999/08/12 22:36:23 mike
* move the 3 'simple' function to the top of file
* so GCC can in-line the code.
*
* Revision 1.6 1999/08/12 22:01:14 mike
* more minor optimizations
*
* Revision 1.5 1999/08/12 02:02:06 mike
* minor optimization
*
* Revision 1.4 1999/08/10 22:51:59 mike
* minor tweak
*
* Revision 1.3 1999/08/10 00:45:47 mike
* added more comments and a few minor tweaks
*
* Revision 1.2 1999/08/09 02:50:02 mike
* add some comments
*
* Revision 1.1 1999/08/08 18:27:57 mike
* Initial revision
*/
#include <limits.h>
#include "m_apm_lc.h"
/*
 * TRUE until M_fast_multiply has run its one-time setup;
 * M_free_all_fmul sets it back to TRUE after releasing everything.
 */
static int M_firsttimef = TRUE;
/*
* specify the max size the FFT routine can handle
* (in MAPM, #digits = 2 * #bytes)
*
* this number *must* be an exact power of 2.
*
* **WORST** case input numbers (all 9's) has shown that
* the FFT math will overflow if the #define here is
* >= 1048576. On my system, 524,288 worked OK. I will
* factor down another factor of 2 to safeguard against
* other computers that have less precise floating point math.
* If you are confident in your system, 524288 will
* theoretically work fine.
*
* the define here allows the FFT algorithm to multiply two
* 524,288 digit numbers yielding a 1,048,576 digit result.
*/
#define MAX_FFT_BYTES 262144   /* 2^18 bytes; must remain an exact power of 2 */
/*
* the Divide-and-Conquer multiplication kicks in when the size of
* the numbers exceed the capability of the FFT (#define just above).
*
* #bytes D&C call depth
* ------ --------------
* 512K 1
* 1M 2
* 2M 3
* 4M 4
* ... ...
* 2.1990E+12 23
*
* the following stack sizes are sized to meet the
* above 2.199E+12 example, though I wouldn't want to
* wait for it to finish...
*
* Each call requires 7 stack variables to be saved so
* we need a stack depth of 23 * 7 + PAD. (we use 164)
*
* For 'exp_stack', 3 integers also are required to be saved
* for each recursive call so we need a stack depth of
* 23 * 3 + PAD. (we use 72)
*
*
* If the FFT multiply is disabled, resize the arrays
* as follows:
*
* the following stack sizes are sized to meet the
* worst case expected assuming we are multiplying
* numbers with 2.14E+9 (2 ^ 31) digits.
*
* For sizeof(int) == 4 (32 bits) there may be up to 32 recursive
* calls. Each call requires 7 stack variables so we need a
* stack depth of 32 * 7 + PAD. (we use 240)
*
* For 'exp_stack', 3 integers also are required to be saved
* for each recursive call so we need a stack depth of
* 32 * 3 + PAD. (we use 100)
*/
#ifdef NO_FFT_MULTIPLY
/* no FFT: the D&C recursion runs all the way down to 4 byte operands */
#define M_STACK_SIZE 240       /* 32 levels * 7 work buffers + pad */
#define M_ISTACK_SIZE 100      /* 32 levels * 3 saved ints + pad   */
#else
/* FFT enabled: the D&C recursion stops once sz == MAX_FFT_BYTES */
#define M_STACK_SIZE 164       /* 23 levels * 7 work buffers + pad */
#define M_ISTACK_SIZE 72       /* 23 levels * 3 saved ints + pad   */
#endif
/* integer save stack driven by M_push_mul_int / M_pop_mul_int */
static int exp_stack[M_ISTACK_SIZE];
static int exp_stack_ptr;      /* index of the top entry; M_fast_multiply resets it to -1 */
/*
 * pool of work buffers handed out by M_get_stack_ptr, together with
 * the size recorded for each slot (0 means the slot has no buffer yet)
 */
static UCHAR *mul_stack_data[M_STACK_SIZE];
static int mul_stack_data_size[M_STACK_SIZE];
static int M_mul_stack_ptr;    /* index of the last slot handed out; M_fast_multiply resets it to -1 */
/*
 * scratch pointers into mul_stack_data used by M_fmul_div_conq; they are
 * file-level so that the recursive function needs no local variables
 */
static UCHAR *fmul_a1, *fmul_a0, *fmul_a9, *fmul_b1, *fmul_b0,
*fmul_b9, *fmul_t0;
/*
 * size_flag : value returned by M_get_sizeof_int()
 * bit_limit : 8 * size_flag + 1, iteration limit in M_next_power_of_2
 * stmp, itmp, mii : scratch integers of M_fmul_div_conq
 */
static int size_flag, bit_limit, stmp, itmp, mii;
/* private copies of the two operands, filled in by M_fast_multiply */
static M_APM M_ain;
static M_APM M_bin;
/* message logged when M_get_stack_ptr cannot obtain memory */
static char *M_stack_ptr_error_msg = "\'M_get_stack_ptr\', Out of memory";
/* entry point and the recursive divide-and-conquer worker */
extern void M_fast_multiply(M_APM, M_APM, M_APM);
extern void M_fmul_div_conq(UCHAR *, UCHAR *, UCHAR *, int);
/* base-100 byte string helpers used by the worker */
extern void M_fmul_add(UCHAR *, UCHAR *, int, int);
extern int M_fmul_subtract(UCHAR *, UCHAR *, UCHAR *, int);
extern void M_fmul_split(UCHAR *, UCHAR *, UCHAR *, int);
extern int M_next_power_of_2(int);
/* work buffer pool and integer save stack */
extern int M_get_stack_ptr(int);
extern void M_push_mul_int(int);
extern int M_pop_mul_int(void);
/* base case of the recursion: 4 byte multiply without FFT, FFT multiply otherwise */
#ifdef NO_FFT_MULTIPLY
extern void M_4_byte_multiply(UCHAR *, UCHAR *, UCHAR *);
#else
extern void M_fast_mul_fft(UCHAR *, UCHAR *, UCHAR *, int);
#endif
/*
* the following algorithm is used in this fast multiply routine
* (sometimes called the divide-and-conquer technique.)
*
* assume we have 2 numbers (a & b) with 2N digits.
*
* let : a = (2^N) * A1 + A0 , b = (2^N) * B1 + B0
*
* where 'A1' is the 'most significant half' of 'a' and
* 'A0' is the 'least significant half' of 'a'. Same for
* B1 and B0.
*
* Now use the identity :
*
* ab = (2^(2N) + 2^N) A1B1 + 2^N (A1-A0)(B0-B1) + (2^N + 1) A0B0
*
*
* The original problem of multiplying 2 (2N) digit numbers has
* been reduced to 3 multiplications of N digit numbers plus some
* additions, subtractions, and shifts.
*
* The fast multiplication algorithm used here uses the above
* identity in a recursive process. This algorithm results in
* O(n ^ 1.585) growth.
*/
/*
 *  Release everything this module has allocated: the two private
 *  operand copies (M_ain, M_bin) and every work buffer in the
 *  mul_stack_data pool.  Does nothing if M_fast_multiply has not run
 *  since the last call.  Afterwards the module is back in its
 *  "first time" state, so the next M_fast_multiply re-initializes it.
 */
void M_free_all_fmul(void)
{
    int k;

    if (M_firsttimef != FALSE)
        return;                     /* nothing was ever set up */

    m_apm_free(M_ain);
    m_apm_free(M_bin);

    for (k = 0; k < M_STACK_SIZE; k++)
    {
        if (mul_stack_data_size[k] != 0)
        {
            MAPM_FREE(mul_stack_data[k]);

            /* leave no dangling pointer or stale size behind */
            mul_stack_data[k]      = NULL;
            mul_stack_data_size[k] = 0;
        }
    }

    M_firsttimef = TRUE;
}
/*
 *  Save 'val' on the integer stack: advance the stack index by one
 *  and store the value in the new top slot.
 */
void M_push_mul_int(int val)
{
    exp_stack_ptr += 1;
    exp_stack[exp_stack_ptr] = val;
}
/*
 *  Remove and return the value on top of the integer stack.
 *  The parameter list is spelled '(void)' so the definition is a
 *  real prototype and matches the declaration earlier in this file.
 */
int M_pop_mul_int(void)
{
    int top;

    top = exp_stack[exp_stack_ptr];
    exp_stack_ptr -= 1;

    return(top);
}
/*
 *  Split the (2 * nbytes) byte string 'xin' into two halves:
 *  the first 'nbytes' bytes are copied to 'x1' and the following
 *  'nbytes' bytes are copied to 'x0'.
 */
void M_fmul_split(UCHAR *x1, UCHAR *x0, UCHAR *xin, int nbytes)
{
    UCHAR *second_half = xin + nbytes;

    memcpy(x1, xin, nbytes);
    memcpy(x0, second_half, nbytes);
}
/*
 *  Fast multiply:  rr = aa * bb
 *
 *  Both operands are copied into the module's private numbers
 *  (M_ain, M_bin), padded to a common power-of-2 byte length 'ii',
 *  and multiplied either by the divide-and-conquer routine or by the
 *  FFT routine.  The sign and exponent of the result are computed
 *  here; the raw product is written directly into rr->m_apm_data and
 *  then normalized.
 */
void M_fast_multiply(M_APM rr, M_APM aa, M_APM bb)
{
    void *vp;
    int  ii, k, nexp, sign;

    /* one-time setup of the buffer pool bookkeeping and scratch numbers */
    if (M_firsttimef)
    {
        M_firsttimef = FALSE;

        for (k = 0; k < M_STACK_SIZE; k++)
            mul_stack_data_size[k] = 0;

        size_flag = M_get_sizeof_int();
        bit_limit = 8 * size_flag + 1;

        M_ain = m_apm_init();
        M_bin = m_apm_init();
    }

    /* both stacks start out empty for every multiply */
    exp_stack_ptr   = -1;
    M_mul_stack_ptr = -1;

    m_apm_copy(M_ain, aa);
    m_apm_copy(M_bin, bb);

    sign = M_ain->m_apm_sign * M_bin->m_apm_sign;
    nexp = M_ain->m_apm_exponent + M_bin->m_apm_exponent;

    /* the longer operand determines the working size */
    if (M_ain->m_apm_datalength >= M_bin->m_apm_datalength)
        ii = M_ain->m_apm_datalength;
    else
        ii = M_bin->m_apm_datalength;

    ii = (ii + 1) >> 1;              /* digits -> bytes (2 digits per byte) */
    ii = M_next_power_of_2(ii);

    /*
     *  NOTE(review): the first line(s) of the original comment here were
     *  lost; the surviving text reads "by the caller: m_apm_multiply".
     *  Presumably it stated that 'ii' is at least 4 at this point (the
     *  sz == 4 base case of M_fmul_div_conq needs that) -- confirm.
     */

    k = 2 * ii;                      /* bytes needed for the result; also the
                                        value passed to M_apm_pad below */

    M_apm_pad(M_ain, k);
    M_apm_pad(M_bin, k);

    if (k > rr->m_apm_malloclength)
    {
        if ((vp = MAPM_REALLOC(rr->m_apm_data, (k + 32))) == NULL)
        {
            M_apm_log_error_msg(M_APM_FATAL, "\'M_fast_multiply\', Out of memory");

            /*
             *  if the error handler ever returns, do not replace the
             *  still valid data pointer with NULL and write through it
             */
            return;
        }

        rr->m_apm_malloclength = k + 28;
        rr->m_apm_data = (UCHAR *)vp;
    }

#ifdef NO_FFT_MULTIPLY

    M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data,
                    M_bin->m_apm_data, ii);

#else

    /*
     *  if the numbers are *really* big, use the divide-and-conquer
     *  routine first until the numbers are small enough to be handled
     *  by the FFT algorithm. If the numbers are already small enough,
     *  call the FFT multiplication now.
     *
     *  Note that 'ii' here is (and must be) an exact power of 2.
     */

    if (size_flag == 2)              /* sizeof(int) == 2 : always use the FFT */
    {
        M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data,
                       M_bin->m_apm_data, ii);
    }
    else
    {
        if (ii > (MAX_FFT_BYTES + 2))
        {
            M_fmul_div_conq(rr->m_apm_data, M_ain->m_apm_data,
                            M_bin->m_apm_data, ii);
        }
        else
        {
            M_fast_mul_fft(rr->m_apm_data, M_ain->m_apm_data,
                           M_bin->m_apm_data, ii);
        }
    }

#endif

    rr->m_apm_sign       = sign;
    rr->m_apm_exponent   = nexp;
    rr->m_apm_datalength = 4 * ii;   /* 2 * ii result bytes, 2 digits each */

    M_apm_normalize(rr);
}
* This is the recursive function to perform the multiply. The
* design intent here is to have no local variables. Any local
* data that needs to be saved is saved on one of the two stacks.
*/
void M_fmul_div_conq(UCHAR *rr, UCHAR *aa, UCHAR *bb, int sz)
{
#ifdef NO_FFT_MULTIPLY
if (sz == 4)
{
M_4_byte_multiply(rr, aa, bb);
return;
}
#else
* if the numbers are now small enough, let the FFT algorithm
* finish up.
*/
if (sz == MAX_FFT_BYTES)
{
M_fast_mul_fft(rr, aa, bb, sz);
return;
}
#endif
memset(rr, 0, (2 * sz));
mii = sz >> 1;
itmp = M_get_stack_ptr(mii);
M_push_mul_int(itmp);
fmul_a1 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(mii);
fmul_a0 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(2 * sz);
fmul_a9 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(mii);
fmul_b1 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(mii);
fmul_b0 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(2 * sz);
fmul_b9 = mul_stack_data[itmp];
itmp = M_get_stack_ptr(2 * sz);
fmul_t0 = mul_stack_data[itmp];
M_fmul_split(fmul_a1, fmul_a0, aa, mii);
M_fmul_split(fmul_b1, fmul_b0, bb, mii);
stmp = M_fmul_subtract(fmul_a9, fmul_a1, fmul_a0, mii);
stmp *= M_fmul_subtract(fmul_b9, fmul_b0, fmul_b1, mii);
M_push_mul_int(stmp);
M_push_mul_int(mii);
M_fmul_div_conq(fmul_t0, fmul_a0, fmul_b0, mii);
mii = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();
M_push_mul_int(itmp);
M_push_mul_int(stmp);
M_push_mul_int(mii);
fmul_a1 = mul_stack_data[itmp];
fmul_a0 = mul_stack_data[itmp+1];
fmul_a9 = mul_stack_data[itmp+2];
fmul_b1 = mul_stack_data[itmp+3];
fmul_b0 = mul_stack_data[itmp+4];
fmul_b9 = mul_stack_data[itmp+5];
fmul_t0 = mul_stack_data[itmp+6];
*/
fmul_a1 = mul_stack_data[itmp];
fmul_b1 = mul_stack_data[itmp+3];
fmul_t0 = mul_stack_data[itmp+6];
memcpy((rr + sz), fmul_t0, sz);
M_fmul_add(rr, fmul_t0, mii, sz);
M_fmul_div_conq(fmul_t0, fmul_a1, fmul_b1, mii);
mii = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();
M_push_mul_int(itmp);
M_push_mul_int(stmp);
M_push_mul_int(mii);
fmul_a9 = mul_stack_data[itmp+2];
fmul_b9 = mul_stack_data[itmp+5];
fmul_t0 = mul_stack_data[itmp+6];
M_fmul_add(rr, fmul_t0, 0, sz);
M_fmul_add(rr, fmul_t0, mii, sz);
if (stmp != 0)
M_fmul_div_conq(fmul_t0, fmul_a9, fmul_b9, mii);
mii = M_pop_mul_int();
stmp = M_pop_mul_int();
itmp = M_pop_mul_int();
fmul_t0 = mul_stack_data[itmp+6];
* if the sign of (A1 - A0)(B0 - B1) is positive, ADD to
* the result. if it is negative, SUBTRACT from the result.
*/
if (stmp < 0)
{
fmul_a9 = mul_stack_data[itmp+2];
fmul_b9 = mul_stack_data[itmp+5];
memset(fmul_b9, 0, (2 * sz));
memcpy((fmul_b9 + mii), fmul_t0, sz);
M_fmul_subtract(fmul_a9, rr, fmul_b9, (2 * sz));
memcpy(rr, fmul_a9, (2 * sz));
}
if (stmp > 0)
M_fmul_add(rr, fmul_t0, mii, sz);
M_mul_stack_ptr -= 7;
}
* special addition function for use with the fast multiply operation
*/
void M_fmul_add(UCHAR *r, UCHAR *a, int offset, int sz)
{
int i, j;
UCHAR carry;
carry = 0;
j = offset + sz;
i = sz;
while (TRUE)
{
r[--j] += carry + a[--i];
if (r[j] >= 100)
{
r[j] -= 100;
carry = 1;
}
else
carry = 0;
if (i == 0)
break;
}
if (carry)
{
while (TRUE)
{
r[--j] += 1;
if (r[j] < 100)
break;
r[j] -= 100;
}
}
}
* special subtraction function for use with the fast multiply operation
*/
int M_fmul_subtract(UCHAR *r, UCHAR *a, UCHAR *b, int sz)
{
int k, jtmp, sflag, nb, borrow;
nb = sz;
sflag = 0;
* find if a > b (so we perform a-b)
* or a < b (so we perform b-a)
*/
for (k=0; k < nb; k++)
{
if (a[k] < b[k])
{
sflag = -1;
break;
}
if (a[k] > b[k])
{
sflag = 1;
break;
}
}
if (sflag == 0)
{
memset(r, 0, nb);
}
else
{
k = nb;
borrow = 0;
while (TRUE)
{
k--;
if (sflag == 1)
jtmp = (int)a[k] - ((int)b[k] + borrow);
else
jtmp = (int)b[k] - ((int)a[k] + borrow);
if (jtmp >= 0)
{
r[k] = (UCHAR)jtmp;
borrow = 0;
}
else
{
r[k] = (UCHAR)(100 + jtmp);
borrow = 1;
}
if (k == 0)
break;
}
}
return(sflag);
}
/*
 *  Return the smallest power of 2 that is >= n.
 *
 *  Values of n <= 2 are returned unchanged.
 *
 *  If no power of 2 >= n fits in an 'int', a fatal error is logged.
 *  The check is made *before* doubling, so the signed shift can never
 *  overflow.  Should the error handler return, the largest power of 2
 *  that fits in an 'int' is returned instead of looping any further.
 */
int M_next_power_of_2(int n)
{
    int k;

    if (n <= 2)
        return(n);

    k = 2;

    while (k < n)
    {
        if (k > (INT_MAX >> 1))      /* doubling 'k' would overflow */
        {
            M_apm_log_error_msg(M_APM_FATAL,
                "\'M_next_power_of_2\', ERROR :sizeof(int) too small ??");
            break;
        }

        k = k << 1;
    }

    return(k);
}
/*
 *  Hand out the next work buffer of the pool and make sure it can
 *  hold at least 'sz' bytes.
 *
 *  The pool index is advanced by one.  A slot that has never been
 *  used gets a new buffer (never recorded smaller than 16 bytes); a
 *  slot whose recorded size is too small is grown.  Allocations ask
 *  for 4 bytes more than the size that is recorded.
 *
 *  returns the index of the slot in mul_stack_data[]
 */
int M_get_stack_ptr(int sz)
{
    int   slot, want;
    UCHAR *buf;

    M_mul_stack_ptr += 1;
    slot = M_mul_stack_ptr;

    if (mul_stack_data_size[slot] != 0)
    {
        /* slot already owns a buffer: done unless it is too small */

        if (sz <= mul_stack_data_size[slot])
            return(slot);

        buf = (UCHAR *)MAPM_REALLOC(mul_stack_data[slot], (sz + 4));

        if (buf == NULL)
        {
            M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
        }

        mul_stack_data[slot]      = buf;
        mul_stack_data_size[slot] = sz;

        return(slot);
    }

    /* first use of this slot */

    want = (sz < 16) ? 16 : sz;

    buf = (UCHAR *)MAPM_MALLOC(want + 4);

    if (buf == NULL)
    {
        M_apm_log_error_msg(M_APM_FATAL, M_stack_ptr_error_msg);
    }

    mul_stack_data[slot]      = buf;
    mul_stack_data_size[slot] = want;

    return(slot);
}
#ifdef NO_FFT_MULTIPLY
* multiply a 4 byte number by a 4 byte number
* yielding an 8 byte result. each byte contains
* a base 100 'digit', i.e.: range from 0-99.
*
* MSB LSB
*
* a,b [0] [1] [2] [3]
* result [0] ..... [7]
*/
void M_4_byte_multiply(UCHAR *r, UCHAR *a, UCHAR *b)
{
int jj;
unsigned int *ip, t1, rr[8];
memset(rr, 0, (8 * sizeof(int)));
jj = 3;
ip = rr + 5;
* loop for one number [b], un-roll the inner 'loop' [a]
*
* accumulate partial sums in UINT array, release carries
* and convert back to base 100 at the end
*/
while (1)
{
t1 = (unsigned int)b[jj];
ip += 2;
*ip-- += t1 * a[3];
*ip-- += t1 * a[2];
*ip-- += t1 * a[1];
*ip += t1 * a[0];
if (jj-- == 0)
break;
}
jj = 7;
while (1)
{
t1 = rr[jj] / 100;
r[jj] = (UCHAR)(rr[jj] - 100 * t1);
if (jj == 0)
break;
rr[--jj] += t1;
}
}
#endif