-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFloat_Functions.h
231 lines (204 loc) · 5.38 KB
/
Float_Functions.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#include "myFloat.h"
int counter = 0;
int least_precision = 7;
Float add(Float, Float);
Float multiply(Float, Float);
Float zero();
Float new_float(float f, int decprecision);
void printinfo(Float f);
void epsToPositive(Float *f);
long double valid2toabs(float f, int valid);
int validabsto2(float val, long double eps);
int valid2to10(int bin);
int valid10to2(int dec);
long double valid2toabs(float f, int valid)
{
long double ld = f;
long addr = (long)(&ld);
*(long*)addr = 0x8000000000000000; //set Frac 0
addr += 8;
long Exp = (*(long*)(addr)) & 0x7FFF;
if(Exp >= valid)
Exp -= valid;
else
Exp = 0;
*(long*)addr = (*(long*)addr) & 0x8000;
*(long*)addr = (*(long*)addr) | Exp; //set Exp - 23
ld /= 2;
if (ld < 0)
ld = -ld;
return ld;
}
int validabsto2(float val, long double eps)
{
long double ldval = val;
long addr = (long)(&ldval);
addr += 8;
long valExp = (*(long*)(addr)) & 0x7FFF;
addr = (long)(&eps);
addr += 8;
long epsExp = (*(long*)(addr)) & 0x7FFF;
int diff = valExp - epsExp - 1;
return (diff > 0)?diff:0;
}
int valid2to10(int bin)
{
if (bin >= 32)
{
printf("Not support.\n");
return 10;
}
int vec[32] = {0,1,1,1,2,2,2,3,3,3,4,4,4,4,5,5,5,6,6,6,7,7,7,7,8,8,8,9,9,9,10,10};
return vec[bin];
}
int valid10to2(int dec)
{
if (dec < 0)
return 23;
if (dec >= 10)
{
printf("Not support.\n");
return 23;
}
int vec[10] = {0,1,4,7,10,14,18,20,24,27};
return vec[dec];
}
Float add(Float a, Float b){
Float result = zero();
int i, j, k;
// Calculate with high precision
long double new_val = (long double)a.val + (long double)b.val;
// Calculate in float
float float_val = a.val + b.val;
// New epsilon from above calculation
long double new_eps = new_val - float_val;
if (new_eps < 0) new_eps = -new_eps;
// For each possible eps
for( i = 0; i < counter; i++){
result.eps[i] = a.eps[i] + b.eps[i];
}
// For new epsilon
result.eps[counter++] = new_eps;
// For epshi
result.epshi = a.epshi + b.epshi;
// For val
result.val = float_val;
epsToPositive(&result);
result.valid_bit = a.valid_bit < b.valid_bit? a.valid_bit : b.valid_bit;
return result;
}
Float sub(Float a, Float b){
b.val = -b.val;
return add(a, b);
}
Float new_float(float f, int decprecision){
Float result = zero();
long double new_eps = 0.0;
// if (given_eps >= -0.0000000000001 && given_eps <= 0.0000000000001)
// new_eps = valid2toabs(f);
// else
// new_eps = given_eps;
result.valid_bit = decprecision < 0 ? 7 : decprecision;
new_eps = valid2toabs(f, valid10to2(decprecision));
//printf("debug: %Le\n", new_eps);
// Set value
result.val = f;
// Set initial epsilon
result.eps[counter++] = new_eps;
// No high order epsilon
result.epshi = 0;
return result;
}
Float multiply(Float a, Float b){
int i, j, k;
Float result = zero();
long double new_eps = 0;
long double new_epshi = 0;
// Calculate the new epsilon
long double new_val = (long double)a.val * (long double)b.val;
float float_val = a.val * b.val;
new_eps = new_val - float_val;
if (new_eps < 0) new_eps = -new_eps;
// Calculate eps
for( i = 0; i < MAX; i++){
result.eps[i] = a.val * b.eps[i] + b.val * a.eps[i];
}
// Calculate epshi
for(i = 0; i < MAX; i++){
for (j = 0; j < MAX; j++){
new_epshi += a.eps[i] * b.eps[j];
}
}
for (i = 0; i < MAX; i++){
new_epshi += a.epshi * b.eps[i];
}
new_epshi += a.epshi * b.val;
new_epshi += a.epshi * b.epshi;
for (i = 0; i < MAX; i++){
new_epshi += b.epshi * a.eps[i];
}
new_epshi += b.epshi * a.val;
result.epshi = new_epshi;
// Add new epsilon
result.eps[counter++] = new_eps;
// For val
result.val = float_val;
epsToPositive(&result);
result.valid_bit = a.valid_bit < b.valid_bit? a.valid_bit : b.valid_bit;
return result;
}
Float zero(){
Float result;
int i;
result.val = 0;
result.epshi = 0;
for( i = 0; i < MAX; i++){
result.eps[i] = 0;
}
return result;
}
void printinfo(Float f)
{
int i;
int validbit;
printf("\nvalue: %e\n", f.val);
for (i = 0; i < counter; i++)
{
printf("eps[%d]: %Le\n", i, f.eps[i]);
}
printf("epshi: %Le\n", f.epshi);
long double sumerror = f.epshi;
for (i = 0; i < counter; i++)
{
sumerror += f.eps[i];
}
printf("Predicted Max Absolute Error: %Le\n", sumerror);
if (f.val <= 1e-8 && f.val >= -1e-8)
printf("Predicted Max Relative Error: NaN\n");
else
printf("Predicted Max Relative Error: %Le\n", sumerror/f.val >= 0? sumerror/f.val:-sumerror/f.val);
validbit = valid2to10(validabsto2(f.val, sumerror)) < f.valid_bit? valid2to10(validabsto2(f.val, sumerror)) : f.valid_bit;
printf("Valid bits in dec: %d\n", validbit);
if (validbit < least_precision)
{
printf("Precision not Satisfied!\n");
exit(-1);
}
else
return;
}
void epsToPositive(Float *f)
{
int i = 0;
for (i = 0; i < counter; i++)
{
if (f->eps[i] < 0)
{
f->eps[i] = -f->eps[i];
}
}
if (f->epshi < 0)
{
f->epshi = -(f->epshi);
}
}