Looking through the latest libnds, I've just found new defines for a maths coprocessor. They look like this.
Now, I'm not sure how a maths coprocessor works, but my thinking is that you can maybe do one calculation on the coprocessor and one on the CPU, thus doing two at once.
This might be good for something like perspective-correct mapping, as it would let you smooth out the extra divide per pixel. Either that, or it's just a super-fast way of doing divides and so on.
Or am I just being stupid?
#ifndef MATH_ARM9_INCLUDE#define MATH_ARM9_INCLUDE#include <nds/jtypes.h>// Math coprocessor register definitions#define DIV_CR                                                (*(vuint16*)(0x04000280))#define DIV_NUMERATOR64                (*(vint64*) (0x04000290))#define DIV_NUMERATOR32                (*(vint32*) (0x04000290))#define DIV_DENOMINATOR64        (*(vint64*) (0x04000298))#define DIV_DENOMINATOR32        (*(vint32*) (0x04000298))#define DIV_RESULT64                        (*(vint64*) (0x040002A0))#define DIV_RESULT32                        (*(vint32*) (0x040002A0))#define DIV_REMANDER64                (*(vint64*) (0x040002A8))#define DIV_REMANDER32                (*(vint32*) (0x040002A8))#define SQRT_CR                                                (*(vuint16*)(0x040002B0))#define SQRT_PARAM64                        (*(vint64*) (0x040002B8))#define SQRT_RESULT32                        (*(vint32*) (0x040002B4))#define SQRT_PARAM32                        (*(vint32*) (0x040002B8))// Math coprocessor modes#define DIV_64_64                        2#define DIV_64_32                        1#define DIV_32_32                        0#define DIV_BUSY                        (1<<15)#define SQRT_64                                1#define SQRT_32                                0#define SQRT_BUSY                        (1<<15)// Fixed Point versions// Fixed point divide// Takes 1.19.12 numerator and denominator// and returns 1.19.12 resultstatic inline f32 divf32(f32 num, f32 den){        DIV_CR = DIV_64_32;        while(DIV_CR & DIV_BUSY);        DIV_NUMERATOR64 = ((int64)num) << 12;        DIV_DENOMINATOR32 = den;        while(DIV_CR & DIV_BUSY);        return (DIV_RESULT32);}// Fixed point multiply//        Takes 1.19.12 values and returns//        1.19.12 resultstatic inline f32 mulf32(f32 a, f32 b){        long long result = (long long)a*(long long)b;        return (f32)(result >> 12);}// Fixed point square root//        Takes 1.19.12 fixed point value and//     
   returns the fixed point resultstatic inline f32 sqrtf32(f32 a){        SQRT_CR = SQRT_64;        while(SQRT_CR & SQRT_BUSY);        SQRT_PARAM64 = ((int64)a) << 12;        while(SQRT_CR & SQRT_BUSY);        return SQRT_RESULT32;}// Integer versions// Integer divide// Takes a 32 bit numerator and 32 bit//        denominator and returns 32 bit resultstatic inline int32 div32(int32 num, int32 den){        DIV_CR = DIV_32_32;        while(DIV_CR & DIV_BUSY);        DIV_NUMERATOR32 = num;        DIV_DENOMINATOR32 = den;        while(DIV_CR & DIV_BUSY);        return (DIV_RESULT32);}// Integer divide// Takes a 32 bit numerator and 32 bit//        denominator and returns 32 bit resultstatic inline int32 mod32(int32 num, int32 den){        DIV_CR = DIV_32_32;        while(DIV_CR & DIV_BUSY);        DIV_NUMERATOR32 = num;        DIV_DENOMINATOR32 = den;        while(DIV_CR & DIV_BUSY);        return (DIV_REMANDER32);}// Integer divide//        Takes a 64 bit numerator and 32 bit// denominator are returns 32 bit resultstatic inline int32 div64(int64 num, int32 den){        DIV_CR = DIV_64_32;        while(DIV_CR & DIV_BUSY);        DIV_NUMERATOR64 = num;        DIV_DENOMINATOR32 = den;        while(DIV_CR & DIV_BUSY);        return (DIV_RESULT32);}// Integer divide//        Takes a 64 bit numerator and 32 bit// denominator are returns 32 bit resultstatic inline int32 mod64(int64 num, int32 den){        DIV_CR = DIV_64_32;        while(DIV_CR & DIV_BUSY);        DIV_NUMERATOR64 = num;        DIV_DENOMINATOR32 = den;        while(DIV_CR & DIV_BUSY);        return (DIV_REMANDER32);}// Integer square root// takes a 32 bit integer and returns//        32 bit resultstatic inline int32 sqrt32(int a){        SQRT_CR = SQRT_32;        while(SQRT_CR & SQRT_BUSY);        SQRT_PARAM32 = a;        while(SQRT_CR & SQRT_BUSY);        return SQRT_RESULT32;}// Trig Functions 1.19.12 fixed point// Cross product// x = Ay * Bz - By * Az// y = Az * Bx - Bz * Ax// z = Ax * By - Bx * Aystatic 
inline void crossf32(f32 *a, f32 *b, f32 *result){        result[0] = mulf32(a[1], b[2]) - mulf32(b[1], a[2]);        result[1] = mulf32(a[2], b[0]) - mulf32(b[2], a[0]);        result[2] = mulf32(a[0], b[1]) - mulf32(b[0], a[1]);}// Dot Product// result = Ax * Bx + Ay * By + Az * Bzstatic inline f32 dotf32(f32 *a, f32 *b){        return mulf32(a[0], b[0]) + mulf32(a[1], b[1]) + mulf32(a[2], b[2]);}// Normalize// Ax = Ax / mag// Ay = Ay / mag// Az = Az / magstatic inline void normalizef32(f32* a){        // magnitude = sqrt ( Ax^2 + Ay^2 + Az^2 )        f32 magnitude = sqrtf32( mulf32(a[0], a[0]) + mulf32(a[1], a[1]) + mulf32(a[2], a[2]) );        a[0] = divf32(a[0], magnitude);        a[1] = divf32(a[1], magnitude);        a[2] = divf32(a[2], magnitude);}#endifnow im not sure how a maths coprossesor works but my thinking is that you can mabey do one calc on the coprossesor and one on the cpu thus doing two at once
This might be good for something like perspective-correct mapping, as it would let you smooth out the extra divide per pixel. Either that, or it's just a super-fast way of doing divides and so on.
Or am I just being stupid?

