/* M4RI 20250128 — xor_template.h */
#include <m4ri/m4ri_config.h>
#include <m4ri/misc.h>

12static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
13 assert(1 <= N && N <= 8);
14
15#if __M4RI_HAVE_SSE2
16
17 assert((__M4RI_ALIGNMENT(m, 16) == 8) | (__M4RI_ALIGNMENT(m, 16) == 0));
18
19 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
20 case 8: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[7], 16));
21 case 7: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[6], 16));
22 case 6: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[5], 16));
23 case 5: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[4], 16));
24 case 4: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[3], 16));
25 case 3: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[2], 16));
26 case 2: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[1], 16));
27 case 1: assert(__M4RI_ALIGNMENT(m, 16) == __M4RI_ALIGNMENT(t[0], 16));
28 };
29
30 if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m, 16) == 8)) {
31 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
32 case 8:
33 *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++;
34 break;
35 case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
36 case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
37 case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
38 case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
39 case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
40 case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
41 case 1: *m++ ^= *t[0]++; break;
42 };
43 wide--;
44 }
45
46 __m128i *m__ = (__m128i *)m;
47 __m128i *t__[N];
48
49 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
50 case 8: t__[N - 8] = (__m128i *)t[N - 8];
51 case 7: t__[N - 7] = (__m128i *)t[N - 7];
52 case 6: t__[N - 6] = (__m128i *)t[N - 6];
53 case 5: t__[N - 5] = (__m128i *)t[N - 5];
54 case 4: t__[N - 4] = (__m128i *)t[N - 4];
55 case 3: t__[N - 3] = (__m128i *)t[N - 3];
56 case 2: t__[N - 2] = (__m128i *)t[N - 2];
57 case 1: t__[N - 1] = (__m128i *)t[N - 1];
58 };
59
60 __m128i xmm0, xmm1, xmm2, xmm3;
61
62 wi_t i = 0;
63 for (; i + 4 <= (wide >> 1); i += 4) {
64 xmm0 = m__[0];
65 xmm1 = m__[1];
66 xmm2 = m__[2];
67 xmm3 = m__[3];
68 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
69 case 8:
70 xmm0 = _mm_xor_si128(xmm0, t__[7][0]);
71 xmm1 = _mm_xor_si128(xmm1, t__[7][1]);
72 xmm2 = _mm_xor_si128(xmm2, t__[7][2]);
73 xmm3 = _mm_xor_si128(xmm3, t__[7][3]);
74 t__[7] += 4;
75 case 7:
76 xmm0 = _mm_xor_si128(xmm0, t__[6][0]);
77 xmm1 = _mm_xor_si128(xmm1, t__[6][1]);
78 xmm2 = _mm_xor_si128(xmm2, t__[6][2]);
79 xmm3 = _mm_xor_si128(xmm3, t__[6][3]);
80 t__[6] += 4;
81 case 6:
82 xmm0 = _mm_xor_si128(xmm0, t__[5][0]);
83 xmm1 = _mm_xor_si128(xmm1, t__[5][1]);
84 xmm2 = _mm_xor_si128(xmm2, t__[5][2]);
85 xmm3 = _mm_xor_si128(xmm3, t__[5][3]);
86 t__[5] += 4;
87 case 5:
88 xmm0 = _mm_xor_si128(xmm0, t__[4][0]);
89 xmm1 = _mm_xor_si128(xmm1, t__[4][1]);
90 xmm2 = _mm_xor_si128(xmm2, t__[4][2]);
91 xmm3 = _mm_xor_si128(xmm3, t__[4][3]);
92 t__[4] += 4;
93 case 4:
94 xmm0 = _mm_xor_si128(xmm0, t__[3][0]);
95 xmm1 = _mm_xor_si128(xmm1, t__[3][1]);
96 xmm2 = _mm_xor_si128(xmm2, t__[3][2]);
97 xmm3 = _mm_xor_si128(xmm3, t__[3][3]);
98 t__[3] += 4;
99 case 3:
100 xmm0 = _mm_xor_si128(xmm0, t__[2][0]);
101 xmm1 = _mm_xor_si128(xmm1, t__[2][1]);
102 xmm2 = _mm_xor_si128(xmm2, t__[2][2]);
103 xmm3 = _mm_xor_si128(xmm3, t__[2][3]);
104 t__[2] += 4;
105 case 2:
106 xmm0 = _mm_xor_si128(xmm0, t__[1][0]);
107 xmm1 = _mm_xor_si128(xmm1, t__[1][1]);
108 xmm2 = _mm_xor_si128(xmm2, t__[1][2]);
109 xmm3 = _mm_xor_si128(xmm3, t__[1][3]);
110 t__[1] += 4;
111 case 1:
112 xmm0 = _mm_xor_si128(xmm0, t__[0][0]);
113 xmm1 = _mm_xor_si128(xmm1, t__[0][1]);
114 xmm2 = _mm_xor_si128(xmm2, t__[0][2]);
115 xmm3 = _mm_xor_si128(xmm3, t__[0][3]);
116 t__[0] += 4;
117 }
118 m__[0] = xmm0;
119 m__[1] = xmm1;
120 m__[2] = xmm2;
121 m__[3] = xmm3;
122 m__ += 4;
123 }
124
125 for (; i < (wide >> 1); i++) {
126 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
127 case 8:
128 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
129 xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
130 xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++);
131 xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
132 xmm0 = _mm_xor_si128(xmm0, xmm1);
133 xmm2 = _mm_xor_si128(xmm2, xmm3);
134 xmm0 = _mm_xor_si128(xmm0, xmm2);
135 xmm0 = _mm_xor_si128(*m__, xmm0);
136 break;
137 case 7:
138 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
139 xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
140 xmm0 = _mm_xor_si128(xmm0, *t__[4]++);
141 xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
142 xmm0 = _mm_xor_si128(xmm0, *t__[6]++);
143 xmm0 = _mm_xor_si128(xmm0, xmm1);
144 xmm0 = _mm_xor_si128(*m__, xmm0);
145 break;
146 case 6:
147 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
148 xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
149 xmm0 = _mm_xor_si128(xmm0, *t__[4]++);
150 xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
151 xmm0 = _mm_xor_si128(xmm0, xmm1);
152 xmm0 = _mm_xor_si128(*m__, xmm0);
153 break;
154 case 5:
155 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
156 xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
157 xmm0 = _mm_xor_si128(xmm0, *t__[4]++);
158 xmm0 = _mm_xor_si128(xmm0, xmm1);
159 xmm0 = _mm_xor_si128(*m__, xmm0);
160 break;
161 case 4:
162 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
163 xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
164 xmm0 = _mm_xor_si128(xmm0, xmm1);
165 xmm0 = _mm_xor_si128(*m__, xmm0);
166 break;
167 case 3:
168 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
169 xmm1 = _mm_xor_si128(*m__, *t__[2]++);
170 xmm0 = _mm_xor_si128(xmm0, xmm1);
171 break;
172 case 2:
173 xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
174 xmm0 = _mm_xor_si128(*m__, xmm0);
175 break;
176 case 1: xmm0 = _mm_xor_si128(*m__, *t__[0]++); break;
177 };
178 *m__++ = xmm0;
179 }
180
181 if (wide & 0x1) {
182 m = (word *)m__;
183 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
184 case 8: t[N - 8] = (word *)t__[N - 8];
185 case 7: t[N - 7] = (word *)t__[N - 7];
186 case 6: t[N - 6] = (word *)t__[N - 6];
187 case 5: t[N - 5] = (word *)t__[N - 5];
188 case 4: t[N - 4] = (word *)t__[N - 4];
189 case 3: t[N - 3] = (word *)t__[N - 3];
190 case 2: t[N - 2] = (word *)t__[N - 2];
191 case 1: t[N - 1] = (word *)t__[N - 1];
192 }
193
194 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
195 case 8:
196 *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++;
197 break;
198 case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
199 case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
200 case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
201 case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
202 case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
203 case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
204 case 1: *m++ ^= *t[0]++; break;
205 }
206 }
207 return;
208#else
209
210 for (wi_t i = 0; i < wide; i++) {
211 switch (N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
212 case 8:
213 *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++;
214 break;
215 case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
216 case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
217 case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
218 case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
219 case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
220 case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
221 case 1: *m++ ^= *t[0]++; break;
222 }
223 }
224
225 return;
226#endif // __M4RI_HAVE_SSE2
227}
/*
 * Referenced helpers (defined in m4ri/misc.h):
 *   __M4RI_ALIGNMENT(addr, n) — return the alignment of addr w.r.t. n; for
 *     example the address 17 would be 1-aligned w.r.t. 16 (misc.h:421).
 *   __M4RI_UNLIKELY(cond)     — macro to help with branch prediction
 *     (misc.h:449).
 *   wi_t (int64_t)            — type of word indexes (misc.h:81).
 *   word (uint64_t)           — the typical packed data structure to
 *     represent packed bits (misc.h:87).
 */