From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Use the SIMD Everywhere header only library

Always use the "AVX2" code path, as SIMDe will automatically provide the
non-AVX2 (SSE4.1, NEON, scalar) equivalents

--- a/src/mcf_simd.hh
+++ b/src/mcf_simd.hh
@@ -4,18 +4,13 @@
 #ifndef MCF_SIMD_HH
 #define MCF_SIMD_HH
 
-#if defined __SSE4_1__
-#include <immintrin.h>
-#elif defined __ARM_NEON
-#include <arm_neon.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/avx2.h"
 
 #include <stddef.h>  // size_t
 
 namespace mcf {
 
-#if defined __AVX2__
-
 typedef __m256i SimdInt;
 typedef __m256i SimdUint1;
 
@@ -146,279 +141,6 @@ static inline SimdInt simdChoose1(SimdIn
   return _mm256_shuffle_epi8(items, choices);
 }
 
-#elif defined __SSE4_1__
-
-typedef __m128i SimdInt;
-typedef __m128i SimdUint1;
-
-const int simdBytes = 16;
-
-static inline SimdInt simdZero() {
-  return _mm_setzero_si128();
-}
-
-static inline SimdInt simdZero1() {
-  return _mm_setzero_si128();
-}
-
-static inline SimdInt simdOnes1() {
-  return _mm_set1_epi32(-1);
-}
-
-static inline SimdInt simdLoad(const void *p) {
-  return _mm_loadu_si128((const SimdInt *)p);
-}
-
-static inline SimdInt simdLoad1(const void *p) {
-  return _mm_loadu_si128((const SimdInt *)p);
-}
-
-static inline void simdStore(void *p, SimdInt x) {
-  _mm_storeu_si128((SimdInt *)p, x);
-}
-
-static inline void simdStore1(void *p, SimdInt x) {
-  _mm_storeu_si128((SimdInt *)p, x);
-}
-
-static inline SimdInt simdOr1(SimdInt x, SimdInt y) {
-  return _mm_or_si128(x, y);
-}
-
-static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) {
-  return _mm_blendv_epi8(x, y, mask);  // SSE4.1
-}
-
-const int simdLen = 4;
-
-static inline SimdInt simdSet(int i3, int i2, int i1, int i0) {
-  return _mm_set_epi32(i3, i2, i1, i0);
-}
-
-static inline SimdInt simdSet1(char iF, char iE, char iD, char iC,
-			       char iB, char iA, char i9, char i8,
-			       char i7, char i6, char i5, char i4,
-			       char i3, char i2, char i1, char i0) {
-  return _mm_set_epi8(iF, iE, iD, iC, iB, iA, i9, i8,
-		      i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static inline SimdInt simdFill(int x) {
-  return _mm_set1_epi32(x);
-}
-
-static inline SimdInt simdFill1(char x) {
-  return _mm_set1_epi8(x);
-}
-
-static inline SimdInt simdGt(SimdInt x, SimdInt y) {
-  return _mm_cmpgt_epi32(x, y);
-}
-
-static inline SimdInt simdGe1(SimdInt x, SimdInt y) {
-  return _mm_cmpeq_epi8(_mm_min_epu8(x, y), y);
-}
-
-static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
-  return _mm_add_epi32(x, y);
-}
-
-static inline SimdInt simdAdd1(SimdInt x, SimdInt y) {
-  return _mm_add_epi8(x, y);
-}
-
-static inline SimdInt simdAdds1(SimdInt x, SimdInt y) {
-  return _mm_adds_epu8(x, y);
-}
-
-static inline SimdInt simdSub(SimdInt x, SimdInt y) {
-  return _mm_sub_epi32(x, y);
-}
-
-static inline SimdInt simdSub1(SimdInt x, SimdInt y) {
-  return _mm_sub_epi8(x, y);
-}
-
-static inline SimdInt simdQuadruple1(SimdInt x) {
-  return _mm_slli_epi32(x, 2);
-}
-
-static inline SimdInt simdMax(SimdInt x, SimdInt y) {
-  return _mm_max_epi32(x, y);  // SSE4.1
-}
-
-static inline SimdInt simdMin1(SimdInt x, SimdInt y) {
-  return _mm_min_epu8(x, y);
-}
-
-static inline int simdHorizontalMax(SimdInt x) {
-  x = simdMax(x, _mm_shuffle_epi32(x, 0x4E));
-  x = simdMax(x, _mm_shuffle_epi32(x, 0xB1));
-  return _mm_cvtsi128_si32(x);
-}
-
-static inline int simdHorizontalMin1(SimdInt x) {
-  x = _mm_min_epu8(x, _mm_srli_epi16(x, 8));
-  x = _mm_minpos_epu16(x);  // SSE4.1
-  return _mm_extract_epi16(x, 0);
-}
-
-static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) {
-  return _mm_shuffle_epi8(items, choices);  // SSSE3
-}
-
-#elif defined __ARM_NEON
-
-typedef int32x4_t SimdInt;
-typedef uint32x4_t SimdUint;
-typedef uint8x16_t SimdUint1;
-
-const int simdBytes = 16;
-
-static inline SimdInt simdZero() {
-  return vdupq_n_s32(0);
-}
-
-static inline SimdUint1 simdZero1() {
-  return vdupq_n_u8(0);
-}
-
-static inline SimdUint1 simdOnes1() {
-  return vdupq_n_u8(-1);
-}
-
-static inline SimdInt simdLoad(const int *p) {
-  return vld1q_s32(p);
-}
-
-static inline SimdUint1 simdLoad1(const unsigned char *p) {
-  return vld1q_u8(p);
-}
-
-static inline void simdStore(int *p, SimdInt x) {
-  vst1q_s32(p, x);
-}
-
-static inline void simdStore1(unsigned char *p, SimdUint1 x) {
-  vst1q_u8(p, x);
-}
-
-static inline SimdUint1 simdOr1(SimdUint1 x, SimdUint1 y) {
-  return vorrq_u8(x, y);
-}
-
-static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdUint mask) {
-  return vbslq_s32(mask, y, x);
-}
-
-const int simdLen = 4;
-
-static inline SimdInt simdSet(unsigned i3, unsigned i2,
-                              unsigned i1, unsigned i0) {
-  size_t lo = i1;
-  size_t hi = i3;
-  return
-    vcombine_s32(vcreate_s32((lo << 32) | i0), vcreate_s32((hi << 32) | i2));
-}
-
-static inline SimdUint1 simdSet1(unsigned char iF, unsigned char iE,
-				 unsigned char iD, unsigned char iC,
-				 unsigned char iB, unsigned char iA,
-				 unsigned char i9, unsigned char i8,
-				 unsigned char i7, unsigned char i6,
-				 unsigned char i5, unsigned char i4,
-				 unsigned char i3, unsigned char i2,
-				 unsigned char i1, unsigned char i0) {
-  size_t lo =
-    (size_t)i0       | (size_t)i1 <<  8 | (size_t)i2 << 16 | (size_t)i3 << 24 |
-    (size_t)i4 << 32 | (size_t)i5 << 40 | (size_t)i6 << 48 | (size_t)i7 << 56;
-
-  size_t hi =
-    (size_t)i8       | (size_t)i9 <<  8 | (size_t)iA << 16 | (size_t)iB << 24 |
-    (size_t)iC << 32 | (size_t)iD << 40 | (size_t)iE << 48 | (size_t)iF << 56;
-
-  return vcombine_u8(vcreate_u8(lo), vcreate_u8(hi));
-}
-
-static inline SimdInt simdFill(int x) {
-  return vdupq_n_s32(x);
-}
-
-static inline SimdUint1 simdFill1(unsigned char x) {
-  return vdupq_n_u8(x);
-}
-
-static inline SimdUint simdGt(SimdInt x, SimdInt y) {
-  return vcgtq_s32(x, y);
-}
-
-static inline SimdUint1 simdGe1(SimdUint1 x, SimdUint1 y) {
-  return vcgeq_u8(x, y);
-}
-
-static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
-  return vaddq_s32(x, y);
-}
-
-static inline SimdUint1 simdAdd1(SimdUint1 x, SimdUint1 y) {
-  return vaddq_u8(x, y);
-}
-
-static inline SimdUint1 simdAdds1(SimdUint1 x, SimdUint1 y) {
-  return vqaddq_u8(x, y);
-}
-
-static inline SimdInt simdSub(SimdInt x, SimdInt y) {
-  return vsubq_s32(x, y);
-}
-
-static inline SimdUint1 simdSub1(SimdUint1 x, SimdUint1 y) {
-  return vsubq_u8(x, y);
-}
-
-static inline SimdUint1 simdQuadruple1(SimdUint1 x) {
-  return vshlq_n_u8(x, 2);
-}
-
-static inline SimdInt simdMax(SimdInt x, SimdInt y) {
-  return vmaxq_s32(x, y);
-}
-
-static inline SimdUint1 simdMin1(SimdUint1 x, SimdUint1 y) {
-  return vminq_u8(x, y);
-}
-
-static inline int simdHorizontalMax(SimdInt x) {
-  return vmaxvq_s32(x);
-}
-
-static inline int simdHorizontalMin1(SimdUint1 x) {
-  return vminvq_u8(x);
-}
-
-static inline SimdUint1 simdChoose1(SimdUint1 items, SimdUint1 choices) {
-  return vqtbl1q_u8(items, choices);
-}
-
-#else
-
-typedef int SimdInt;
-const int simdBytes = 1;
-const int simdLen = 1;
-static inline int simdZero() { return 0; }
-static inline int simdSet(int x) { return x; }
-static inline int simdFill(int x) { return x; }
-static inline int simdLoad(const int *p) { return *p; }
-static inline void simdStore(int *p, int x) { *p = x; }
-static inline int simdGt(int x, int y) { return x > y; }
-static inline int simdAdd(int x, int y) { return x + y; }
-static inline int simdSub(int x, int y) { return x - y; }
-static inline int simdMax(int x, int y) { return x > y ? x : y; }
-static inline int simdBlend(int x, int y, int mask) { return mask ? y : x; }
-static inline int simdHorizontalMax(int a) { return a; }
-
-#endif
-
 }
 
 #endif
--- a/src/GappedXdropAligner.cc
+++ b/src/GappedXdropAligner.cc
@@ -140,17 +140,13 @@ int GappedXdropAligner::align(const ucha
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
-#if defined __SSE4_1__ || defined __ARM_NEON
-#ifdef __AVX2__
 			    s1[7][s2[7]],
 			    s1[6][s2[6]],
 			    s1[5][s2[5]],
 			    s1[4][s2[4]],
-#endif
 			    s1[3][s2[3]],
 			    s1[2][s2[2]],
 			    s1[1][s2[1]],
-#endif
 			    s1[0][s2[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
--- a/src/GappedXdropAlignerPssm.cc
+++ b/src/GappedXdropAlignerPssm.cc
@@ -91,17 +91,13 @@ int GappedXdropAligner::alignPssm(const
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
-#if defined __SSE4_1__ || defined __ARM_NEON
-#ifdef __AVX2__
 			    s2[-7][s1[7]],
 			    s2[-6][s1[6]],
 			    s2[-5][s1[5]],
 			    s2[-4][s1[4]],
-#endif
 			    s2[-3][s1[3]],
 			    s2[-2][s1[2]],
 			    s2[-1][s1[1]],
-#endif
 			    s2[-0][s1[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
--- a/makefile
+++ b/makefile
@@ -1,4 +1,4 @@
-CXXFLAGS = -msse4 -O3 -std=c++11 -pthread
+CXXFLAGS = -O3 -std=c++11 -pthread
 all:
 	@cd src && $(MAKE) CXXFLAGS="$(CXXFLAGS)"
 
--- a/src/makefile
+++ b/src/makefile
@@ -1,5 +1,4 @@
 CXXFLAGS = -O3 -Wall -Wextra -g
-CXXFLAGS += -msse4
 CXXFLAGS += -std=c++11
 CXXFLAGS += -pthread -DHAS_CXX_THREAD
 # -fomit-frame-pointer ?
@@ -56,11 +55,12 @@ split/last-split.o
 PPOBJ = last-pair-probs.o last-pair-probs-main.o
 
 MBOBJ = last-merge-batches.o
+SFX :=
 
-LAST4 = ../bin/lastdb ../bin/lastal ../bin/last-split
-LAST5 = ../bin/lastdb5 ../bin/lastal5 ../bin/last-split5
-LAST8 = ../bin/lastdb8 ../bin/lastal8 ../bin/last-split8
-ALL = $(LAST4) $(LAST5) ../bin/last-merge-batches ../bin/last-pair-probs
+LAST4 = ../bin/lastdb$(SFX) ../bin/lastal$(SFX) ../bin/last-split$(SFX)
+LAST5 = ../bin/lastdb5$(SFX) ../bin/lastal5$(SFX) ../bin/last-split5$(SFX)
+LAST8 = ../bin/lastdb8$(SFX) ../bin/lastal8$(SFX) ../bin/last-split8$(SFX)
+ALL = $(LAST4) $(LAST5) ../bin/last-merge-batches$(SFX) ../bin/last-pair-probs$(SFX)
 
 indexObj5 = $(indexObj4:.o=.o5)
 alignObj5 = $(alignObj4:.o=.o5)
@@ -74,45 +74,45 @@ all: $(ALL)
 last8: $(LAST8)
 
 indexAllObj4 = $(indexObj0) $(indexObj4)
-../bin/lastdb: $(indexAllObj4)
+../bin/lastdb$(SFX): $(indexAllObj4)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4) -lz
 
 indexAllObj5 = $(indexObj0) $(indexObj5)
-../bin/lastdb5: $(indexAllObj5)
+../bin/lastdb5$(SFX): $(indexAllObj5)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj5) -lz
 
 indexAllObj8 = $(indexObj0) $(indexObj8)
-../bin/lastdb8: $(indexAllObj8)
+../bin/lastdb8$(SFX): $(indexAllObj8)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8) -lz
 
 alignAllObj4 = $(alignObj0) $(alignObj4)
-../bin/lastal: $(alignAllObj4)
+../bin/lastal$(SFX): $(alignAllObj4)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4) -lz
 
 alignAllObj5 = $(alignObj0) $(alignObj5)
-../bin/lastal5: $(alignAllObj5)
+../bin/lastal5$(SFX): $(alignAllObj5)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj5) -lz
 
 alignAllObj8 = $(alignObj0) $(alignObj8)
-../bin/lastal8: $(alignAllObj8)
+../bin/lastal8$(SFX): $(alignAllObj8)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8) -lz
 
 splitAllObj4 = $(splitObj0) $(splitObj4)
-../bin/last-split: $(splitAllObj4)
+../bin/last-split$(SFX): $(splitAllObj4)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj4)
 
 splitAllObj5 = $(splitObj0) $(splitObj5)
-../bin/last-split5: $(splitAllObj5)
+../bin/last-split5$(SFX): $(splitAllObj5)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj5)
 
 splitAllObj8 = $(splitObj0) $(splitObj8)
-../bin/last-split8: $(splitAllObj8)
+../bin/last-split8$(SFX): $(splitAllObj8)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj8)
 
-../bin/last-pair-probs: $(PPOBJ)
+../bin/last-pair-probs$(SFX): $(PPOBJ)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) -lz
 
-../bin/last-merge-batches: $(MBOBJ)
+../bin/last-merge-batches$(SFX): $(MBOBJ)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ)
 
 .SUFFIXES:
--- a/src/GappedXdropAlignerDna.cc
+++ b/src/GappedXdropAlignerDna.cc
@@ -4,8 +4,6 @@
 #include "GappedXdropAligner.hh"
 #include "GappedXdropAlignerInl.hh"
 
-#if defined __SSE4_1__ || defined __ARM_NEON
-
 //#include <iostream>  // for debugging
 
 namespace cbrc {
@@ -43,12 +41,10 @@ int GappedXdropAligner::alignDna(const u
 
   const SimdUint1 scorer4x4 =
     simdSet1(
-#ifdef __AVX2__
 		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
 		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
 		 scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0],
-#endif
 		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
 		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
@@ -126,7 +122,6 @@ int GappedXdropAligner::alignDna(const u
 
       for (int i = 0; i < numCells; i += simdBytes) {
 	SimdUint1 s = simdSet1(
-#ifdef __AVX2__
 			     scorer[s1[31]][s2[31]],
 			     scorer[s1[30]][s2[30]],
 			     scorer[s1[29]][s2[29]],
@@ -143,7 +138,6 @@ int GappedXdropAligner::alignDna(const u
 			     scorer[s1[18]][s2[18]],
 			     scorer[s1[17]][s2[17]],
 			     scorer[s1[16]][s2[16]],
-#endif
 			     scorer[s1[15]][s2[15]],
 			     scorer[s1[14]][s2[14]],
 			     scorer[s1[13]][s2[13]],
@@ -275,5 +269,3 @@ bool GappedXdropAligner::getNextChunkDna
 }
 
 }
-
-#endif
--- a/src/Alignment.cc
+++ b/src/Alignment.cc
@@ -365,12 +365,10 @@ void Alignment::extend( std::vector< Seg
 				  del.openCost, del.growCost,
 				  ins.openCost, ins.growCost,
 				  gap.pairCost, gap.isAffine, maxDrop, smMax)
-#if defined __SSE4_1__ || defined __ARM_NEON
     : isSimdMatrix ? aligner.alignDna(seq1, seq2, isForward, sm,
 				      del.openCost, del.growCost,
 				      ins.openCost, ins.growCost,
 				      maxDrop, smMax, alph.numbersToUppercase)
-#endif
     :           aligner.align(seq1, seq2, isForward, globality, sm,
 			      del.openCost, del.growCost,
 			      ins.openCost, ins.growCost,
@@ -387,14 +385,12 @@ void Alignment::extend( std::vector< Seg
       while( greedyAligner.getNextChunk( end1, end2, size ) )
 	chunks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
     }
-#if defined __SSE4_1__ || defined __ARM_NEON
     else if (isSimdMatrix && !pssm2 && !sm2qual) {
       while (aligner.getNextChunkDna(end1, end2, size,
 				     del.openCost, del.growCost,
 				     ins.openCost, ins.growCost))
 	chunks.push_back(SegmentPair(end1 - size, end2 - size, size));
     }
-#endif
     else {
       while( aligner.getNextChunk( end1, end2, size,
 				   del.openCost, del.growCost,
--- a/src/GappedXdropAligner.hh
+++ b/src/GappedXdropAligner.hh
@@ -352,7 +352,6 @@ class GappedXdropAligner {
   void initFrame();
 
   // Everything below here is for alignDna & getNextChunkDna
-#if defined __SSE4_1__ || defined __ARM_NEON
   std::vector<TinyScore> xTinyScores;
   std::vector<TinyScore> yTinyScores;
   std::vector<TinyScore> zTinyScores;
@@ -402,7 +401,6 @@ class GappedXdropAligner {
     while (*x2 != target) ++x2;
     bestSeq1position = x2 - x2beg + seq1beg;
   }
-#endif
 };
 
 }
