From a2fdd9be8cf252d3ea727457f4e309185baf8596 Mon Sep 17 00:00:00 2001 From: Rob Tillaart Date: Thu, 2 Dec 2021 20:23:05 +0100 Subject: [PATCH] add basic math, optimize compare operators (#5) * add basic math, optimize compare operators * fix negation * fix comparison --- README.md | 104 +++++------ examples/float16_test0/float16_test0.ino | 34 ++-- examples/float16_test1/float16_test1.ino | 5 +- .../float16_test_all/float16_test_all.ino | 4 +- .../float16_test_array/float16_test_array.ino | 5 +- .../float16_test_negative.ino | 2 +- .../float16_test_performance.ino | 89 +++++++++- .../float16_test_powers2.ino | 1 - .../float16_test_special.ino | 2 - float16.cpp | 168 +++++++++++------- float16.h | 65 ++++--- library.json | 2 +- library.properties | 2 +- 13 files changed, 299 insertions(+), 184 deletions(-) diff --git a/README.md b/README.md index 08ea288..4997418 100644 --- a/README.md +++ b/README.md @@ -21,22 +21,23 @@ The library implements the **Printable** interface so one can directly print the float16 values in any stream e.g. Serial. The primary usage of the float16 data type is to efficiently store and transport -a floating point number. As it is only 2 bytes where float and double have typical -4 and 8, gains can be made at the price of range and precision. +a floating point number. As it uses only 2 bytes where float and double have typical +4 and 8 bytes, gains can be made at the price of range and precision. ## Specifications -| attribute | value | notes | -|:----------|:-------------|:-------------| -| Size | 2 bytes | | -| sign | 1 bit | | -| mantissa | 11 bit | ~ 3 digits | -| exponent | 4 bit | | -| minimum | 1.0009765625 | 1 + 2^−10 | -| maximum | 65504 | | -| | | | +| attribute | value | notes | +|:----------|:-------------|:--------| +| size | 2 bytes | layout s eeeee mmmmmmmmmm +| sign | 1 bit | +| exponent | 5 bit | +| mantissa | 10 bit | 11 bit precision (implicit leading 1), ~ 3 digits +| minimum | 5.96046 E−8 | smallest positive number. 
+| | 1.0009765625 | 1 + 2^−10 = smallest number larger than 1. +| maximum | 65504 | +| | | ## Interface @@ -52,72 +53,73 @@ to elaborate #### Conversion -- **double toDouble(void)** convert to double (or float) +- **double toDouble(void)** convert to double (or float). +- **uint16_t getBinary()** get the 2 byte binary representation. +- **void setBinary(uint16_t u)** set the 2 byte binary representation. - **size_t printTo(Print& p) const** Printable interface. - **void setDecimals(uint8_t d)** idem, used for printTo. - **uint8_t getDecimals()** idem. -Note the setDecimals takes one byte per object which is not efficient for arrays. +Note the setDecimals takes one byte per object which is not efficient for arrays of float16. See array example for efficient storage using set/getBinary() functions. #### Compare -to elaborate - - -## Notes +Standard compare functions. Since 0.1.5 these are quite optimized, +so it is fast to compare e.g. 2 measurements. +- **bool operator == (const float16& f)** +- **bool operator != (const float16& f)** +- **bool operator > (const float16& f)** +- **bool operator >= (const float16& f)** +- **bool operator < (const float16& f)** +- **bool operator <= (const float16& f)** -#### comparison functions -First version of inequality operations are implemented by converting data to double and compare those. -The strategy is to get these working first and optionally optimize them later. +#### Math (basic) +Math is done by converting to double, doing the math, and converting back. +These operators are added for convenience only. +There are no plans to optimize these. 
-## TODO (future) +- **float16 operator + (const float16& f)** +- **float16 operator - (const float16& f)** +- **float16 operator \* (const float16& f)** +- **float16 operator / (const float16& f)** +- **float16& operator += (const float16& f)** +- **float16& operator -= (const float16& f)** +- **float16& operator \*= (const float16& f)** +- **float16& operator /= (const float16& f)** -to get focus on getting things done... +negation operator. +- **float16 operator - ()** fast negation. +- **int sign()** returns 1 == positive, 0 == zero, -1 == negative. +- **bool isZero()** returns true if zero. slightly faster than **sign()**. +- **bool isInf()** returns true if value is (-)infinite. -#### 0.1.4 - -the following should work: - -- update documentation -- positive numbers -- negative numbers -- infinity -- rounding to zero (e.g. 1e-30) -- array of numbers. -- unit tests of the above.. +## Notes -#### 0.1.5 -- update documentation -- comparison operators -- unit tests of the above.. +## Future #### 0.1.6 -- update documentation -- get basic math working (+-*/) -- isNan() -- isINF() -- abs() -- sgn() -- unit tests of the above.. +- update documentation. +- unit tests of the above. +- isNan(). #### later -- update documentation -- get basic math II working += -= *= /= -- divide by zero errors. -- f16tof32() + f32tof16() -- rewrite toDouble with bit magic -- ... - +- update documentation. +- error handling. + - divide by zero errors. +- look for optimizations. +- rewrite **f16tof32()** with bit magic. +- add storage example - with SD card, FRAM or EEPROM +- add communication example - serial or Ethernet? 
diff --git a/examples/float16_test0/float16_test0.ino b/examples/float16_test0/float16_test0.ino index 920bcb2..eee4b47 100644 --- a/examples/float16_test0/float16_test0.ino +++ b/examples/float16_test0/float16_test0.ino @@ -6,27 +6,27 @@ // DATE: 2015-03-11 // URL: https://github.com/RobTillaart/float16 // -// Released to the public domain -// + /* -0 01111 0000000000 = 1 -0 01111 0000000001 = 1 + 2−10 = 1.0009765625 (next smallest float after 1) -1 10000 0000000000 = −2 + SIGN EXP MANTISSA + 0 01111 0000000000 = 1 + 0 01111 0000000001 = 1 + 2−10 = 1.0009765625 (next smallest float after 1) + 1 10000 0000000000 = −2 -0 11110 1111111111 = 65504 (max half precision) + 0 11110 1111111111 = 65504 (max half precision) -0 00001 0000000000 = 2−14 ≈ 6.10352 × 10−5 (minimum positive normal) -0 00000 1111111111 = 2−14 - 2−24 ≈ 6.09756 × 10−5 (maximum subnormal) -0 00000 0000000001 = 2−24 ≈ 5.96046 × 10−8 (minimum positive subnormal) + 0 00001 0000000000 = 2−14 ≈ 6.10352 × 10−5 (minimum positive normal) + 0 00000 1111111111 = 2−14 - 2−24 ≈ 6.09756 × 10−5 (maximum subnormal) + 0 00000 0000000001 = 2−24 ≈ 5.96046 × 10−8 (minimum positive subnormal) -0 00000 0000000000 = 0 -1 00000 0000000000 = −0 + 0 00000 0000000000 = 0 + 1 00000 0000000000 = −0 -0 11111 0000000000 = infinity -1 11111 0000000000 = −infinity + 0 11111 0000000000 = infinity + 1 11111 0000000000 = −infinity -0 01101 0101010101 = 0.333251953125 ≈ 1/3 + 0 01101 0101010101 = 0.333251953125 ≈ 1/3 */ #include "float16.h" @@ -62,8 +62,8 @@ void test_constructors() Serial.println("\ntest_constructors:"); float16 a; Serial.println(a.toDouble(), 9); - Serial.println(a.getBinary(), HEX); - + Serial.println(a.getBinary(), HEX); + float16 b = 6; Serial.println(b.toDouble(), 9); Serial.println(b.getBinary(), HEX); @@ -144,7 +144,7 @@ void test_numbers() Serial.println("** OVERFLOW **"); float16 f(1000000.0); Serial.println(f.toDouble(), 9); - + Serial.println("** UNDERFLOW **"); float16 g(1 / 1000000.0); 
Serial.println(g.toDouble(), 9); diff --git a/examples/float16_test1/float16_test1.ino b/examples/float16_test1/float16_test1.ino index 18b7bb8..6ad2564 100644 --- a/examples/float16_test1/float16_test1.ino +++ b/examples/float16_test1/float16_test1.ino @@ -7,6 +7,7 @@ // URL: https://github.com/RobTillaart/float16 // + #include "float16.h" float16 X; @@ -19,11 +20,11 @@ void setup() Serial.println(__FILE__); Serial.print("FLOAT16_LIB_VERSION: "); Serial.println(FLOAT16_LIB_VERSION); - Serial.println("\nStart "); float f; - for (uint16_t n = 0; n < 65535; n++) + // dump all possible values + for (uint16_t n = 0; n < 65535; n++) { f = X.f16tof32(n); Serial.print(n); diff --git a/examples/float16_test_all/float16_test_all.ino b/examples/float16_test_all/float16_test_all.ino index db2f9c3..3ea300f 100644 --- a/examples/float16_test_all/float16_test_all.ino +++ b/examples/float16_test_all/float16_test_all.ino @@ -77,7 +77,7 @@ void test_1() Serial.print('\t'); float current = f16.toDouble(); Serial.print(current, 8); - if (prev > current) + if (prev > current) // numbers should be increasing. { Serial.print("\t\tERROR"); errors++; @@ -107,7 +107,7 @@ void test_1() Serial.print('\t'); float current = f16.toDouble(); Serial.print(current, 8); - if (prev < current) + if (prev < current) // negative numbers should be decreasing. 
{ Serial.print("\t\tERROR"); errors++; diff --git a/examples/float16_test_array/float16_test_array.ino b/examples/float16_test_array/float16_test_array.ino index 5162415..1e2c1d7 100644 --- a/examples/float16_test_array/float16_test_array.ino +++ b/examples/float16_test_array/float16_test_array.ino @@ -22,9 +22,10 @@ void setup() Serial.println(__FILE__); Serial.print("FLOAT16_LIB_VERSION: "); Serial.println(FLOAT16_LIB_VERSION); - Serial.println("\nStart "); + Serial.println(); + - // simulate temperature with random numbers + // simulate temperature sensor with random numbers for (uint32_t n = 0; n < 10; n++) { temperature[n] = (random(1000) - 300) * 0.01; diff --git a/examples/float16_test_negative/float16_test_negative.ino b/examples/float16_test_negative/float16_test_negative.ino index 0063335..444af2a 100644 --- a/examples/float16_test_negative/float16_test_negative.ino +++ b/examples/float16_test_negative/float16_test_negative.ino @@ -17,7 +17,7 @@ void setup() Serial.println(__FILE__); Serial.print("FLOAT16_LIB_VERSION: "); Serial.println(FLOAT16_LIB_VERSION); - Serial.println("\nStart "); + Serial.println(); for( int i = -10; i < 2; i++) diff --git a/examples/float16_test_performance/float16_test_performance.ino b/examples/float16_test_performance/float16_test_performance.ino index 116353f..f004d3c 100644 --- a/examples/float16_test_performance/float16_test_performance.ino +++ b/examples/float16_test_performance/float16_test_performance.ino @@ -24,9 +24,9 @@ void setup() Serial.println(FLOAT16_LIB_VERSION); Serial.println(); - f = random(1000000) * 0.001; - // CONSTRUCTORS + Serial.println("CONSTRUCTORS"); + f = random(1000000) * 0.001; start = micros(); float16 f16(f); stop = micros(); @@ -41,8 +41,10 @@ void setup() Serial.print("a = b: \t"); Serial.println(stop - start); delay(10); + Serial.println(); + - // CONVERSION + Serial.println("CONVERSION"); start = micros(); f = f16.toDouble(); stop = micros(); @@ -52,7 +54,7 @@ void setup() Serial.println(); 
- // COMPARE + Serial.println("COMPARE"); f17 = f16.toDouble() + 1; start = micros(); @@ -96,6 +98,85 @@ void setup() Serial.print("compare > : \t"); Serial.println(stop - start); delay(10); + Serial.println(); + + + Serial.println("MATH I"); + float16 f18; + start = micros(); + f18 = f16 + f17; + stop = micros(); + Serial.print("math + : \t"); + Serial.println(stop - start); + delay(10); + // Serial.println(f16); + // Serial.println(f17); + // Serial.println(f18); + + start = micros(); + f18 = f16 - f17; + stop = micros(); + Serial.print("math - : \t"); + Serial.println(stop - start); + delay(10); + + start = micros(); + f18 = f16 * f17; + stop = micros(); + Serial.print("math * : \t"); + Serial.println(stop - start); + delay(10); + + start = micros(); + f18 = f16 / f17; + stop = micros(); + Serial.print("math / : \t"); + Serial.println(stop - start); + delay(10); + Serial.println(); + + Serial.println("MATH II"); + start = micros(); + f18 += f16; + stop = micros(); + Serial.print("math += : \t"); + Serial.println(stop - start); + delay(10); + + start = micros(); + f18 -= f16; + stop = micros(); + Serial.print("math -= : \t"); + Serial.println(stop - start); + delay(10); + + start = micros(); + f18 *= f16; + stop = micros(); + Serial.print("math *= : \t"); + Serial.println(stop - start); + delay(10); + + start = micros(); + f18 /= f16; + stop = micros(); + Serial.print("math /= : \t"); + Serial.println(stop - start); + delay(10); + Serial.println(); + + Serial.println(f16); + + Serial.println("MATH III - negation"); + start = micros(); + f18 = -f16; + stop = micros(); + Serial.print("negation : \t"); + Serial.println(stop - start); + delay(10); + Serial.println(); + + Serial.println(f18); Serial.println("\ndone"); } diff --git a/examples/float16_test_powers2/float16_test_powers2.ino b/examples/float16_test_powers2/float16_test_powers2.ino index 76146f7..59d4ff5 100644 --- a/examples/float16_test_powers2/float16_test_powers2.ino +++ 
b/examples/float16_test_powers2/float16_test_powers2.ino @@ -18,7 +18,6 @@ void setup() Serial.println(__FILE__); Serial.print("FLOAT16_LIB_VERSION: "); Serial.println(FLOAT16_LIB_VERSION); - Serial.println("\nStart "); for (uint32_t n = 1; n < 65536; n *= 2) { diff --git a/examples/float16_test_special/float16_test_special.ino b/examples/float16_test_special/float16_test_special.ino index f768fe4..5d1691c 100644 --- a/examples/float16_test_special/float16_test_special.ino +++ b/examples/float16_test_special/float16_test_special.ino @@ -31,7 +31,6 @@ void setup() Serial.println(__FILE__); Serial.print("FLOAT16_LIB_VERSION: "); Serial.println(FLOAT16_LIB_VERSION); - Serial.println("\nStart "); f16.setDecimals(6); @@ -46,7 +45,6 @@ void setup() Serial.print("\t"); Serial.println(); } - Serial.println(); Serial.println(); diff --git a/float16.cpp b/float16.cpp index cf65e98..1bcd278 100644 --- a/float16.cpp +++ b/float16.cpp @@ -22,146 +22,191 @@ // CONSTRUCTOR float16::float16(double f) { - n = f32tof16(f); + _value = f32tof16(f); } // PRINTING size_t float16::printTo(Print& p) const { - double d = this->f16tof32(n); - return p.print(d, _decimals); + double d = this->f16tof32(_value); + return p.print(d, _decimals); }; double float16::toDouble() const { - return f16tof32(n); + return f16tof32(_value); } -// NEGATE -float16 float16::operator - () +////////////////////////////////////////////////////////// +// +// EQUALITIES +// +bool float16::operator == (const float16 &f) { - return float16( -f16tof32(n) ); + return (_value == f._value); } -// bool float16::isNaN(); -// bool float16::isInf(); - -// EQUALITIES -bool float16::operator == (const float16 &f) +bool float16::operator != (const float16 &f) { - return (n == f.n); + return (_value != f._value); } +bool float16::operator > (const float16 &f) +{ + if ((_value & 0x8000) && ( f._value & 0x8000)) return _value < f._value; + if (_value & 0x8000) return false; + if (f._value & 0x8000) return true; + return _value > 
f._value; +} -bool float16::operator != (const float16 &f) +bool float16::operator >= (const float16 &f) { - return (n != f.n); + if ((_value & 0x8000) && (f._value & 0x8000)) return _value <= f._value; + if (_value & 0x8000) return false; + if (f._value & 0x8000) return true; + return _value >= f._value; } +bool float16::operator < (const float16 &f) +{ + if ((_value & 0x8000) && (f._value & 0x8000)) return _value > f._value; + if (_value & 0x8000) return true; + if (f._value & 0x8000) return false; + return _value < f._value; +} -bool float16::operator > (const float16 &c) +bool float16::operator <= (const float16 &f) { - return this->toDouble() > c.toDouble(); + if ((_value & 0x8000) && (f._value & 0x8000)) return _value >= f._value; + if (_value & 0x8000) return true; + if (f._value & 0x8000) return false; + return _value <= f._value; } -bool float16::operator >= (const float16 &c) +////////////////////////////////////////////////////////// +// +// NEGATION +// +float16 float16::operator - () { - return this->toDouble() >= c.toDouble(); + float16 f16; + f16.setBinary(_value ^ 0x8000); + return f16; } -bool float16::operator < (const float16 &c) +////////////////////////////////////////////////////////// +// +// MATH +// +float16 float16::operator + (const float16 &f) { - return this->toDouble() < c.toDouble(); + return float16(this->toDouble() + f.toDouble()); } -bool float16::operator <= (const float16 &c) +float16 float16::operator - (const float16 &f) { - return this->toDouble() <= c.toDouble(); + return float16(this->toDouble() - f.toDouble()); } -/* -// BASIC MATH I -float16 float16::operator + (const float16 &c) +float16 float16::operator * (const float16 &f) { - return (float16(this->toDouble() + c.toDouble()); + return float16(this->toDouble() * f.toDouble()); } -float16 float16::operator - (const float16 &c) + +float16 float16::operator / (const float16 &f) { - return (float16(this->toDouble() - c.toDouble()); + return float16(this->toDouble() / 
f.toDouble()); } -float16 float16::operator * (const float16 &c) + +float16& float16::operator += (const float16 &f) { - return (float16(this->toDouble() * c.toDouble()); + *this = this->toDouble() + f.toDouble(); + return *this; } -float16 float16::operator / (const float16 &c) + +float16& float16::operator -= (const float16 &f) { - return (float16(this->toDouble() / c.toDouble()); + *this = this->toDouble() - f.toDouble(); + return *this; } -*/ -/* -// BASIC MATH II -float16& float16::operator += (const float16 &c) +float16& float16::operator *= (const float16 &f) { + *this = this->toDouble() * f.toDouble(); + return *this; } -float16& float16::operator -= (const float16 &c) + +float16& float16::operator /= (const float16 &f) { + *this = this->toDouble() / f.toDouble(); + return *this; } -float16& float16::operator *= (const float16 &c) + +////////////////////////////////////////////////////////// +// +// MATH HELPER FUNCTIONS +// + +int float16::sign() { + if ((_value & 0x7FFF) == 0) return 0; // +0 and -0 are both zero, consistent with isZero() + if (_value & 0x8000) return -1; + return 1; } -float16& float16::operator /= (const float16 &c) + +bool float16::isZero() { + return ((_value & 0x7FFF) == 0x0000); } -*/ +// bool float16::isNaN() +// { + // return ((_value & 0x7FFF) > 0x7C00); +// } + +bool float16::isInf() +{ + return ((_value == 0x7C00) || (_value == 0xFC00)); +} -float float16::f16tof32(uint16_t n) const +////////////////////////////////////////////////////////// +// +// CORE CONVERSION +// +float float16::f16tof32(uint16_t _value) const { uint16_t sgn, man; int exp; double f; - sgn = (n & 0x8000) > 0; - exp = (n & 0x7C00) >> 10; - man = (n & 0x03FF); - -#ifdef DEBUG - Serial.println(sgn, BIN); - Serial.println(exp, BIN); - Serial.println(man, BIN); -#endif + sgn = (_value & 0x8000) > 0; + exp = (_value & 0x7C00) >> 10; + man = (_value & 0x03FF); // ZERO - if ((n & 0x7FFF) == 0) + if ((_value & 0x7FFF) == 0) { -#ifdef DEBUG - Serial.println("ZERO"); -#endif return sgn ? 
-0 : 0; } // NAN & INF if (exp == 0x001F) { -#ifdef DEBUG - Serial.println("INFINITY"); -#endif if (man == 0) return sgn ? -INFINITY : INFINITY; else return NAN; } @@ -193,10 +238,6 @@ uint16_t float16::f32tof16(float f) const int16_t exp = (t & 0x7F800000) >> 23; bool sgn = (t & 0x80000000); - // Serial.print("SGN: "); Serial.println(sgn, BIN); - // Serial.print("EXP: "); Serial.println(exp, BIN); - // Serial.print("MAN: "); Serial.println(man, BIN); - // handle 0 if ((t & 0x7FFFFFFF) == 0) { @@ -241,9 +282,6 @@ uint16_t float16::f32tof16(float f) const exp <<= 10; man++; man >>= 1; - // Serial.print("SGN: "); Serial.println(sgn, BIN); - // Serial.print("EXP: "); Serial.println(exp, BIN); - // Serial.print("MAN: "); Serial.println(man, BIN); if (sgn) return 0x8000 | exp | man; return exp | man; } diff --git a/float16.h b/float16.h index 0b73981..f88b92c 100644 --- a/float16.h +++ b/float16.h @@ -2,7 +2,7 @@ // // FILE: float16.h // AUTHOR: Rob Tillaart -// VERSION: 0.1.4 +// VERSION: 0.1.5 // PURPOSE: Arduino library to implement float16 data type. // half-precision floating point format, // used for efficient storage and transport. @@ -12,61 +12,59 @@ #include "Arduino.h" -#define FLOAT16_LIB_VERSION "0.1.4" +#define FLOAT16_LIB_VERSION (F("0.1.5")) class float16: public Printable { public: // Constructors - float16(void) { n = 0; }; + float16(void) { _value = 0x0000; }; float16(double f); - float16(const float16 &f) { n = f.n; }; + float16(const float16 &f) { _value = f._value; }; // Conversion double toDouble(void) const; // access the 2 byte representation. 
- uint16_t getBinary() { return n; }; - void setBinary(uint16_t u) { n = u; }; + uint16_t getBinary() { return _value; }; + void setBinary(uint16_t u) { _value = u; }; // Printable size_t printTo(Print& p) const; void setDecimals(uint8_t d) { _decimals = d; }; uint8_t getDecimals() { return _decimals; }; + // equalities + bool operator == (const float16& f); + bool operator != (const float16& f); - -// bool isNaN(); -// bool isInf(); - + bool operator > (const float16& f); + bool operator >= (const float16& f); + bool operator < (const float16& f); + bool operator <= (const float16& f); // negation float16 operator - (); - - // equalities - bool operator == (const float16&); - bool operator != (const float16&); - - bool operator > (const float16&); - bool operator >= (const float16&); - bool operator < (const float16&); - bool operator <= (const float16&); - /* // basic math - float16 operator + (const float16&); - float16 operator - (const float16&); - float16 operator * (const float16&); - float16 operator / (const float16&); - - float16& operator += (const float16&); - float16& operator -= (const float16&); - float16& operator *= (const float16&); - float16& operator /= (const float16&); - */ + float16 operator + (const float16& f); + float16 operator - (const float16& f); + float16 operator * (const float16& f); + float16 operator / (const float16& f); + + float16& operator += (const float16& f); + float16& operator -= (const float16& f); + float16& operator *= (const float16& f); + float16& operator /= (const float16& f); + + // math helper functions + int sign(); // 1 = positive 0 = zero -1 = negative. + bool isZero(); +// bool isNaN(); + bool isInf(); - // DEBUGGING + // CORE CONVERSION // should be private but for testing... 
float f16tof32(uint16_t) const; uint16_t f32tof16(float) const; @@ -74,10 +72,7 @@ class float16: public Printable private: uint8_t _decimals = 4; - // TODO - // n is not descriptive, - // should be _n at least; - uint16_t n; + uint16_t _value; }; diff --git a/library.json b/library.json index 7e29f2c..553f0d1 100644 --- a/library.json +++ b/library.json @@ -15,7 +15,7 @@ "type": "git", "url": "https://github.com/RobTillaart/float16.git" }, - "version": "0.1.4", + "version": "0.1.5", "license": "MIT", "frameworks": "arduino", "platforms": "*", diff --git a/library.properties b/library.properties index 197938f..6b277b2 100644 --- a/library.properties +++ b/library.properties @@ -1,5 +1,5 @@ name=float16 -version=0.1.4 +version=0.1.5 author=Rob Tillaart maintainer=Rob Tillaart sentence=Arduino library to implement float16 data type.