Windows implementation WIP

commit 3a7c1ba0eb (parent 16b731cea9)
Author: Kearwood Gilbert
Date: 2016-07-10 03:33:58 -07:00
156 changed files with 86861 additions and 75 deletions

3rdparty/ffts/ffts-master/src/Makefile.am vendored Normal file
@@ -0,0 +1,34 @@
lib_LTLIBRARIES = libffts.la
libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c
libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h
if DYNAMIC_DISABLED
libffts_la_SOURCES += ffts_static.c
else
libffts_la_SOURCES += codegen.c
endif
libffts_includedir=$(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
if HAVE_VFP
libffts_la_SOURCES += vfp.s
else
if HAVE_NEON
libffts_la_SOURCES += neon.s
if DYNAMIC_DISABLED
libffts_la_SOURCES += neon_static_f.s neon_static_i.s
endif
else
if HAVE_SSE
libffts_la_SOURCES += sse.s
endif
endif
endif

3rdparty/ffts/ffts-master/src/Makefile.in vendored Normal file
@@ -0,0 +1,730 @@
# Makefile.in generated by automake 1.14 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
@HAVE_VFP_TRUE@am__append_3 = vfp.s
@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon.s
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon_static_f.s neon_static_i.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s
subdir = src
DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
$(top_srcdir)/depcomp $(libffts_include_HEADERS)
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
$(top_srcdir)/m4/ax_check_java_home.m4 \
$(top_srcdir)/m4/ax_java_options.m4 \
$(top_srcdir)/m4/ax_jni_include_dir.m4 \
$(top_srcdir)/m4/ax_prog_jar.m4 \
$(top_srcdir)/m4/ax_prog_javac.m4 \
$(top_srcdir)/m4/ax_prog_javac_works.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
am__vpath_adj = case $$p in \
$(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
*) f=$$p;; \
esac;
am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
am__install_max = 40
am__nobase_strip_setup = \
srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
am__nobase_strip = \
for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
am__nobase_list = $(am__nobase_strip_setup); \
for p in $$list; do echo "$$p $$p"; done | \
sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
$(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
if (++n[$$2] == $(am__install_max)) \
{ print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
END { for (dir in files) print dir, files[dir] }'
am__base_list = \
sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
am__uninstall_files_from_dir = { \
test -z "$$files" \
|| { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
|| { echo " ( cd '$$dir' && rm -f" $$files ")"; \
$(am__cd) "$$dir" && rm -f $$files; }; \
}
am__installdirs = "$(DESTDIR)$(libdir)" \
"$(DESTDIR)$(libffts_includedir)"
LTLIBRARIES = $(lib_LTLIBRARIES)
libffts_la_LIBADD =
am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \
ffts_real.c ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s neon.s \
neon_static_f.s neon_static_i.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon.lo
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@ neon_static_i.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo
am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
$(am__objects_3) $(am__objects_4) $(am__objects_5) \
$(am__objects_6)
libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__depfiles_maybe = depfiles
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \
$(CCASFLAGS)
AM_V_CCAS = $(am__v_CCAS_@AM_V@)
am__v_CCAS_ = $(am__v_CCAS_@AM_DEFAULT_V@)
am__v_CCAS_0 = @echo " CCAS " $@;
am__v_CCAS_1 =
SOURCES = $(libffts_la_SOURCES)
DIST_SOURCES = $(am__libffts_la_SOURCES_DIST)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
HEADERS = $(libffts_include_HEADERS)
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AWK = @AWK@
CC = @CC@
CCAS = @CCAS@
CCASDEPMODE = @CCASDEPMODE@
CCASFLAGS = @CCASFLAGS@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CXX = @CXX@
CXXCPP = @CXXCPP@
CXXDEPMODE = @CXXDEPMODE@
CXXFLAGS = @CXXFLAGS@
CYGPATH_W = @CYGPATH_W@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
JAR = @JAR@
JAVA = @JAVA@
JAVAC = @JAVAC@
JAVACFLAGS = @JAVACFLAGS@
JAVAFLAGS = @JAVAFLAGS@
JAVAPREFIX = @JAVAPREFIX@
JAVA_PATH_NAME = @JAVA_PATH_NAME@
JNI_CPPFLAGS = @JNI_CPPFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
VERSION = @VERSION@
_ACJNI_JAVAC = @_ACJNI_JAVAC@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_CXX = @ac_ct_CXX@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
lib_LTLIBRARIES = libffts.la
libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c \
ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
patterns.h types.h vfp.h $(am__append_1) $(am__append_2) \
$(am__append_3) $(am__append_4) $(am__append_5) \
$(am__append_6)
libffts_includedir = $(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj .s
$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu src/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
install-libLTLIBRARIES: $(lib_LTLIBRARIES)
@$(NORMAL_INSTALL)
@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
list2=; for p in $$list; do \
if test -f $$p; then \
list2="$$list2 $$p"; \
else :; fi; \
done; \
test -z "$$list2" || { \
echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
$(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
}
uninstall-libLTLIBRARIES:
@$(NORMAL_UNINSTALL)
@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
for p in $$list; do \
$(am__strip_dir) \
echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
done
clean-libLTLIBRARIES:
-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
@list='$(lib_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
.s.o:
$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $<
.s.obj:
$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.s.lo:
$(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $<
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
install-libffts_includeHEADERS: $(libffts_include_HEADERS)
@$(NORMAL_INSTALL)
@list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
if test -n "$$list"; then \
echo " $(MKDIR_P) '$(DESTDIR)$(libffts_includedir)'"; \
$(MKDIR_P) "$(DESTDIR)$(libffts_includedir)" || exit 1; \
fi; \
for p in $$list; do \
if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
echo "$$d$$p"; \
done | $(am__base_list) | \
while read files; do \
echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libffts_includedir)'"; \
$(INSTALL_HEADER) $$files "$(DESTDIR)$(libffts_includedir)" || exit $$?; \
done
uninstall-libffts_includeHEADERS:
@$(NORMAL_UNINSTALL)
@list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
dir='$(DESTDIR)$(libffts_includedir)'; $(am__uninstall_files_from_dir)
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-am
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-am
all-am: Makefile $(LTLIBRARIES) $(HEADERS)
installdirs:
for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libffts_includedir)"; do \
test -z "$$dir" || $(MKDIR_P) "$$dir"; \
done
install: install-am
install-exec: install-exec-am
install-data: install-data-am
uninstall: uninstall-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-am
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-am
clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
mostlyclean-am
distclean: distclean-am
-rm -rf ./$(DEPDIR)
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
dvi: dvi-am
dvi-am:
html: html-am
html-am:
info: info-am
info-am:
install-data-am: install-libffts_includeHEADERS
install-dvi: install-dvi-am
install-dvi-am:
install-exec-am: install-libLTLIBRARIES
install-html: install-html-am
install-html-am:
install-info: install-info-am
install-info-am:
install-man:
install-pdf: install-pdf-am
install-pdf-am:
install-ps: install-ps-am
install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-am
-rm -rf ./$(DEPDIR)
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-am
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-am
pdf-am:
ps: ps-am
ps-am:
uninstall-am: uninstall-libLTLIBRARIES \
uninstall-libffts_includeHEADERS
.MAKE: install-am install-strip
.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
clean-libLTLIBRARIES clean-libtool cscopelist-am ctags \
ctags-am distclean distclean-compile distclean-generic \
distclean-libtool distclean-tags distdir dvi dvi-am html \
html-am info info-am install install-am install-data \
install-data-am install-dvi install-dvi-am install-exec \
install-exec-am install-html install-html-am install-info \
install-info-am install-libLTLIBRARIES \
install-libffts_includeHEADERS install-man install-pdf \
install-pdf-am install-ps install-ps-am install-strip \
installcheck installcheck-am installdirs maintainer-clean \
maintainer-clean-generic mostlyclean mostlyclean-compile \
mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
tags tags-am uninstall uninstall-am uninstall-libLTLIBRARIES \
uninstall-libffts_includeHEADERS
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

3rdparty/ffts/ffts-master/src/codegen.c vendored Normal file

@@ -0,0 +1,732 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "codegen.h"
#include "macros.h"
#include "ffts.h"
#ifdef __APPLE__
#include <libkern/OSCacheControl.h>
#endif
#include <sys/types.h>
#include <sys/mman.h>
#ifdef HAVE_NEON
#include "codegen_arm.h"
#include "neon.h"
#elif HAVE_VFP
#include "codegen_arm.h"
#include "vfp.h"
#else
#include "codegen_sse.h"
#include "macros-sse.h"
#endif
#ifdef __ANDROID__
#include <unistd.h>
#endif
int tree_count(int N, int leafN, int offset) {
if(N <= leafN) return 0;
int count = 0;
count += tree_count(N/4, leafN, offset);
count += tree_count(N/8, leafN, offset + N/4);
count += tree_count(N/8, leafN, offset + N/4 + N/8);
count += tree_count(N/4, leafN, offset + N/2);
count += tree_count(N/4, leafN, offset + 3*N/4);
return 1 + count;
}
void elaborate_tree(size_t **p, int N, int leafN, int offset) {
if(N <= leafN) return;
elaborate_tree(p, N/4, leafN, offset);
elaborate_tree(p, N/8, leafN, offset + N/4);
elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
elaborate_tree(p, N/4, leafN, offset + N/2);
elaborate_tree(p, N/4, leafN, offset + 3*N/4);
(*p)[0] = N;
(*p)[1] = offset*2;
(*p)+=2;
}
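
tree_count() sizes the schedule that elaborate_tree() then fills in post-order: one (N, 2*offset) pair per sub-transform larger than the leaf size, children before parents. A standalone sketch (hypothetical driver, mirroring how ffts_generate_func_code() below uses these two routines) for N = 64, leafN = 8:

#include <stdio.h>
#include <stdlib.h>

/* tree_count() and elaborate_tree() exactly as defined above. */
static int tree_count(int N, int leafN, int offset) {
    if (N <= leafN) return 0;
    return 1 + tree_count(N/4, leafN, offset)
             + tree_count(N/8, leafN, offset + N/4)
             + tree_count(N/8, leafN, offset + N/4 + N/8)
             + tree_count(N/4, leafN, offset + N/2)
             + tree_count(N/4, leafN, offset + 3*N/4);
}

static void elaborate_tree(size_t **p, int N, int leafN, int offset) {
    if (N <= leafN) return;
    elaborate_tree(p, N/4, leafN, offset);
    elaborate_tree(p, N/8, leafN, offset + N/4);
    elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
    elaborate_tree(p, N/4, leafN, offset + N/2);
    elaborate_tree(p, N/4, leafN, offset + 3*N/4);
    (*p)[0] = N; (*p)[1] = offset*2; (*p) += 2;
}

int main(void) {
    int count = tree_count(64, 8, 0) + 1;   /* +1 for the terminator pair */
    size_t *ps = malloc(count * 2 * sizeof(size_t));
    size_t *pps = ps;
    elaborate_tree(&pps, 64, 8, 0);
    pps[0] = pps[1] = 0;                    /* terminator, as in ffts_generate_func_code() */
    for (pps = ps; pps[0]; pps += 2)
        printf("N=%zu at output offset %zu\n", pps[0], pps[1]);
    free(ps);
    /* prints: 16 at 0, 16 at 64, 16 at 96, then the final 64 at 0 */
    return 0;
}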
uint32_t LUT_offset(size_t N, size_t leafN) {
int i;
size_t p_lut_size = 0;
size_t lut_size = 0;
int hardcoded = 0;
size_t n_luts = __builtin_ctzl(N/leafN);
int n = leafN*2;
//if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
for(i=0;i<n_luts-1;i++) {
p_lut_size = lut_size;
if(!i || hardcoded) {
#ifdef __arm__
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
lut_size += n/4 * 2 * sizeof(cdata_t);
#endif
// n *= 2;
} else {
#ifdef __arm__
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
#endif
}
n *= 2;
}
return lut_size;
}
#ifdef __arm__
typedef uint32_t insns_t;
#else
typedef uint8_t insns_t;
#endif
#define P(x) (*(*p)++ = x)
void insert_nops(uint8_t **p, uint32_t count) {
switch(count) {
case 0: break;
case 2: P(0x66);
case 1: P(0x90); break;
case 3: P(0x0F); P(0x1F); P(0x00); break;
case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break;
case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break;
case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
default:
P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00);
insert_nops(p, count-9);
break;
}
}
void align_mem16(uint8_t **p, uint32_t offset) {
#ifdef __x86_64__
int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf);
r = (16 + r) & 0xf;
insert_nops(p, r);
#endif
}
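
align_mem16() pads the instruction stream with the multi-byte NOPs from insert_nops() so that *p + offset lands on a 16-byte boundary. A quick standalone check of that padding arithmetic (my own sketch, not part of the source):

#include <assert.h>
#include <stdint.h>

/* Same arithmetic as align_mem16(), with uintptr_t in place of the
 * code pointer so it can be tested over all 16 residues. */
static uint32_t pad_for(uintptr_t p, uint32_t offset) {
    int r = (16 - (offset & 0xf)) - (int)(p & 0xf);
    return (16 + r) & 0xf;
}

int main(void) {
    uintptr_t p;
    uint32_t off;
    for (p = 0; p < 32; p++)
        for (off = 0; off < 16; off++)
            assert(((p + pad_for(p, off) + off) & 0xf) == 0);
    return 0;
}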
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
int count = tree_count(N, leafN, 0) + 1;
size_t *ps = malloc(count * 2 * sizeof(size_t));
size_t *pps = ps;
#ifdef __x86_64__
if(sign < 0) p->constants = sse_constants;
else p->constants = sse_constants_inv;
#endif
elaborate_tree(&pps, N, leafN, 0);
pps[0] = 0;
pps[1] = 0;
pps = ps;
#ifdef __arm__
if(N < 8192) p->transform_size = 8192;
else p->transform_size = N;
#else
if(N < 2048) p->transform_size = 16384;
else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
#endif
#ifdef __APPLE__
p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
#else
#define MAP_ANONYMOUS 0x20
p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
#endif
/*
if(p->transform_base == MAP_FAILED) {
fprintf(stderr, "MAP FAILED\n");
exit(1);
}*/
insns_t *func = p->transform_base;//valloc(8192);
insns_t *fp = func;
//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
//fprintf(stderr, "Base address = %016p\n", func);
if(!func) {
fprintf(stderr, "NOMEM\n");
exit(1);
}
insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
/*
* Changes adds to subtracts and vice versa to allow the computation
* of both the IFFT and FFT
*/
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
}
fp += (neon_x8_t - neon_x8) / 4;
#else
memcpy(fp, vfp_x8, vfp_end - vfp_x8);
if(sign > 0) {
fp[65] ^= 0x00000040;
fp[66] ^= 0x00000040;
fp[68] ^= 0x00000040;
fp[70] ^= 0x00000040;
fp[103] ^= 0x00000040;
fp[104] ^= 0x00000040;
fp[105] ^= 0x00000040;
fp[108] ^= 0x00000040;
fp[113] ^= 0x00000040;
fp[114] ^= 0x00000040;
fp[117] ^= 0x00000040;
fp[118] ^= 0x00000040;
}
fp += (vfp_end - vfp_x8) / 4;
#endif
#else
align_mem16(&fp, 0);
x_8_addr = fp;
align_mem16(&fp, 5);
memcpy(fp, x8_soft, x8_hard - x8_soft);
fp += (x8_hard - x8_soft);
//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
#endif
//uint32_t *x_8_t_addr = fp;
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
insns_t *x_4_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_x4, neon_x8 - neon_x4);
if(sign < 0) {
fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
}
fp += (neon_x8 - neon_x4) / 4;
#else
memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
if(sign > 0) {
fp[36] ^= 0x00000040;
fp[38] ^= 0x00000040;
fp[43] ^= 0x00000040;
fp[44] ^= 0x00000040;
}
fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
align_mem16(&fp, 0);
x_4_addr = fp;
memcpy(fp, x4, x8_soft - x4);
fp += (x8_soft - x4);
#endif
insns_t *start = fp;
#ifdef __arm__
*fp = PUSH_LR(); fp++;
*fp = 0xed2d8b10; fp++;
ADDI(&fp, 3, 1, 0);
ADDI(&fp, 7, 1, N);
ADDI(&fp, 5, 1, 2*N);
ADDI(&fp, 10, 7, 2*N);
ADDI(&fp, 4, 5, 2*N);
ADDI(&fp, 8, 10, 2*N);
ADDI(&fp, 6, 4, 2*N);
ADDI(&fp, 9, 8, 2*N);
*fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
// *fp++ = LDRI(1, 0, 4); // load ws into r1
ADDI(&fp, 1, 0, 0);
ADDI(&fp, 0, 2, 0); // mov out into r0
#endif
#ifdef __arm__
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
#ifdef HAVE_NEON
MOVI(&fp, 11, p->i0);
#else
MOVI(&fp, 11, p->i0);
#endif
#else
align_mem16(&fp, 0);
start = fp;
*fp++ = 0x4c;
*fp++ = 0x8b;
*fp++ = 0x07;
uint32_t lp_cnt = p->i0 * 4;
MOVI(&fp, RCX, lp_cnt);
//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
#endif
//fp++;
#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_ee, neon_oo - neon_ee);
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_oo - neon_ee) / 4;
#else
memcpy(fp, vfp_e, vfp_o - vfp_e);
if(sign > 0) {
fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
}
fp += (vfp_o - vfp_e) / 4;
#endif
#else
//fprintf(stderr, "Body start address = %016p\n", start);
PUSH(&fp, RBP);
PUSH(&fp, RBX);
PUSH(&fp, R10);
PUSH(&fp, R11);
PUSH(&fp, R12);
PUSH(&fp, R13);
PUSH(&fp, R14);
PUSH(&fp, R15);
int i;
memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
//fprintf(stderr, "Constants address = %016p\n", sse_constants);
//fprintf(stderr, "Constants address = %016p\n", p->constants);
//int32_t val = READ_IMM32(fp + 3);
//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));
//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
//fprintf(stderr, "IMM = 0x%llx\n", v2);
//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp ));
fp += (leaf_ee - leaf_ee_init);
//fprintf(stderr, "Leaf start address = %016p\n", fp);
align_mem16(&fp, 9);
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);
fp += (leaf_oo - leaf_ee);
if(__builtin_ctzl(N) & 1){
if(p->i1) {
lp_cnt += p->i1 * 4;
MOVI(&fp, RCX, lp_cnt);
align_mem16(&fp, 4);
memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
fp += (leaf_eo - leaf_oo);
}
memcpy(fp, leaf_oe, leaf_end - leaf_oe);
lp_cnt += 4;
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4);
fp += (leaf_end - leaf_oe);
}else{
memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
lp_cnt += 4;
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4);
fp += (leaf_oe - leaf_eo);
if(p->i1) {
lp_cnt += p->i1 * 4;
MOVI(&fp, RCX, lp_cnt);
align_mem16(&fp, 4);
memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
fp += (leaf_eo - leaf_oo);
}
}
if(p->i1) {
lp_cnt += p->i1 * 4;
MOVI(&fp, RCX, lp_cnt);
align_mem16(&fp, 9);
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4);
fp += (leaf_oo - leaf_ee);
}
//fprintf(stderr, "Body start address = %016p\n", fp);
//LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
memcpy(fp, x_init, x4 - x_init);
//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
fp += (x4 - x_init);
int32_t pAddr = 0;
int32_t pN = 0;
int32_t pLUT = 0;
count = 2;
while(pps[0]) {
if(!pN) {
MOVI(&fp, RCX, pps[0] / 4);
}else{
if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
if(pps[0] > leafN && pps[0] - pN) {
int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
*fp++ = 0xc1;
if(diff > 0) {
*fp++ = 0xe1;
*fp++ = (diff & 0xff);
}else{
*fp++ = 0xe9;
*fp++ = ((-diff) & 0xff);
}
}
}
if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
if(pps[0] == 2*leafN) {
CALL(&fp, x_4_addr);
// }else if(!pps[2]){
// //uint32_t *x_8_t_addr = fp;
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
// fp += (neon_ee - neon_x8_t) / 4;
// //*fp++ = BL(fp+2, x_8_t_addr);
}else{
CALL(&fp, x_8_addr);
}
pAddr = pps[1] * 4;
if(pps[0] > leafN)
pN = pps[0];
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
}
#endif
#ifdef __arm__
#ifdef HAVE_NEON
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);
if(p->i1) {
MOVI(&fp, 11, p->i1);
memcpy(fp, neon_oo, neon_eo - neon_oo);
if(sign < 0) {
fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_eo - neon_oo) / 4;
}
*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++;
memcpy(fp, neon_oe, neon_end - neon_oe);
if(sign < 0) {
fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
}
fp += (neon_end - neon_oe) / 4;
}else{
*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++;
memcpy(fp, neon_eo, neon_oe - neon_eo);
if(sign < 0) {
fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
}
fp += (neon_oe - neon_eo) / 4;
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);
if(p->i1) {
MOVI(&fp, 11, p->i1);
memcpy(fp, neon_oo, neon_eo - neon_oo);
if(sign < 0) {
fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_eo - neon_oo) / 4;
}
}
if(p->i1) {
ADDI(&fp, 2, 3, 0);
ADDI(&fp, 3, 7, 0);
ADDI(&fp, 7, 2, 0);
ADDI(&fp, 2, 4, 0);
ADDI(&fp, 4, 8, 0);
ADDI(&fp, 8, 2, 0);
ADDI(&fp, 2, 5, 0);
ADDI(&fp, 5, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 6, 0);
ADDI(&fp, 6, 10, 0);
ADDI(&fp, 10, 2, 0);
ADDI(&fp, 2, 9, 0);
ADDI(&fp, 9, 10, 0);
ADDI(&fp, 10, 2, 0);
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
MOVI(&fp, 11, p->i1);
memcpy(fp, neon_ee, neon_oo - neon_ee);
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_oo - neon_ee) / 4;
}
#else
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);
MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
memcpy(fp, vfp_o, vfp_x4 - vfp_o);
if(sign > 0) {
fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
}
fp += (vfp_x4 - vfp_o) / 4;
ADDI(&fp, 2, 3, 0);
ADDI(&fp, 3, 7, 0);
ADDI(&fp, 7, 2, 0);
ADDI(&fp, 2, 4, 0);
ADDI(&fp, 4, 8, 0);
ADDI(&fp, 8, 2, 0);
ADDI(&fp, 2, 5, 0);
ADDI(&fp, 5, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 6, 0);
ADDI(&fp, 6, 10, 0);
ADDI(&fp, 10, 2, 0);
ADDI(&fp, 2, 9, 0);
ADDI(&fp, 9, 10, 0);
ADDI(&fp, 10, 2, 0);
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
memcpy(fp, vfp_e, vfp_o - vfp_e);
if(sign > 0) {
fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
}
fp += (vfp_o - vfp_e) / 4;
#endif
*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
//ADDI(&fp, 2, 1, 0);
MOVI(&fp, 1, 0);
// args: r0 - out
// r1 - N
// r2 - ws
// ADDI(&fp, 3, 1, 0); // put N into r3 for counter
int32_t pAddr = 0;
int32_t pN = 0;
int32_t pLUT = 0;
count = 2;
while(pps[0]) {
// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
if(!pN) {
MOVI(&fp, 1, pps[0]);
}else{
if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
}
if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
if(pps[0] == 2*leafN) {
*fp = BL(fp+2, x_4_addr); fp++;
}else if(!pps[2]){
//uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
}
fp += (neon_ee - neon_x8_t) / 4;
//*fp++ = BL(fp+2, x_8_t_addr);
#else
*fp = BL(fp+2, x_8_addr); fp++;
#endif
}else{
*fp = BL(fp+2, x_8_addr); fp++;
}
pAddr = pps[1] * 4;
pN = pps[0];
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
}
*fp++ = 0xecbd8b10;
*fp++ = POP_LR(); count++;
#else
POP(&fp, R15);
POP(&fp, R14);
POP(&fp, R13);
POP(&fp, R12);
POP(&fp, R11);
POP(&fp, R10);
POP(&fp, RBX);
POP(&fp, RBP);
RET(&fp);
//uint8_t *pp = func;
//int counter = 0;
//do{
// printf("%02x ", *pp);
// if(counter++ % 16 == 15) printf("\n");
//} while(++pp < fp);
//printf("\n");
#endif
// *fp++ = B(14); count++;
//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
// fprintf(stderr, "%08x\n", x_4_addr[i]);
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++)
free(ps);
if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
perror("Couldn't mprotect");
exit(1);
}
#ifdef __APPLE__
sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
#ifdef __GNUC__
__clear_cache((long)(func), (long)(func) + p->transform_size);
#endif
#endif
//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);
p->transform = (void *) (start);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
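
ffts_generate_func_code() follows the classic JIT pattern: mmap writable pages, emit machine code into them, mprotect them executable, flush the instruction cache where the architecture needs it, and call through a function pointer. A self-contained x86-64 sketch of that pattern (my own example; it uses MAP_PRIVATE rather than the MAP_SHARED above, and the six code bytes are just mov eax, 42 then ret):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void) {
    size_t sz = 4096;
    uint8_t *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                        MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (buf == MAP_FAILED) return 1;
    /* x86-64: mov eax, 42 ; ret */
    static const uint8_t code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
    memcpy(buf, code, sizeof(code));
    if (mprotect(buf, sz, PROT_READ | PROT_EXEC)) return 1;
    /* object-to-function cast: the usual JIT idiom, as in p->transform above */
    int (*fn)(void) = (int (*)(void))buf;
    printf("%d\n", fn());   /* prints 42 */
    munmap(buf, sz);
    return 0;
}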

3rdparty/ffts/ffts-master/src/codegen.h vendored Normal file

@@ -0,0 +1,50 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CODEGEN_H__
#define __CODEGEN_H__
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/mman.h>
#include <string.h>
#include <limits.h> /* for PAGESIZE */
#include "ffts.h"
void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

3rdparty/ffts/ffts-master/src/codegen_arm.h vendored Normal file

@@ -0,0 +1,102 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CODEGEN_ARM_H__
#define __CODEGEN_ARM_H__
uint32_t BL(void *pos, void *target) {
return 0xeb000000 | (((target - pos) / 4) & 0xffffff);
}
uint32_t B(uint8_t r) {
return 0xe12fff10 | r;
}
uint32_t MOV(uint8_t dst, uint8_t src) {
return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12);
}
void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
int32_t oimm = imm;
if(imm < 0) {
imm = -imm;
uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
*(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
}else{
uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
*(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
}
uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
return 0xe5900000 | ((dst & 0xf) << 12)
| ((base & 0xf) << 16) | (offset & 0xfff) ;
}
void MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
uint32_t oimm = imm;
uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
*(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;
if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
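
ADDI() and MOVI() both lean on the ARM data-processing immediate format: an 8-bit value rotated right by twice a 4-bit rotate field. Rotating right by (32 - s) equals shifting left by s, which is why the shift amount is forced even and the field is written as (32 - shamt)/2. A standalone check of that identity (my own sketch):

#include <assert.h>
#include <stdint.h>

static uint32_t ror32(uint32_t v, unsigned r) {
    r &= 31;
    return r ? (v >> r) | (v << (32 - r)) : v;
}

int main(void) {
    uint32_t imm = 0x00002800;                    /* example immediate */
    uint32_t shamt = __builtin_ctz(imm) & ~1u;    /* even shift: 10 */
    uint32_t imm8 = imm >> shamt;                 /* 0x0a, fits in 8 bits */
    uint32_t rot = (32 - shamt) / 2;              /* rotate field: 11 */
    assert(imm8 <= 255);
    assert(ror32(imm8, 2 * rot) == imm);          /* reproduces the immediate */
    return 0;
}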
uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; }
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

3rdparty/ffts/ffts-master/src/codegen_sse.h vendored Normal file

@@ -0,0 +1,196 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CODEGEN_SSE_H__
#define __CODEGEN_SSE_H__
void neon_x4(float *, size_t, float *);
void neon_x8(float *, size_t, float *);
void neon_x8_t(float *, size_t, float *);
void leaf_ee_init();
void leaf_ee();
void leaf_oo();
void leaf_eo();
void leaf_oe();
void leaf_end();
void x_init();
void x4();
void x8_soft();
void x8_hard();
void sse_constants();
void sse_constants_inv();
// typedef uint8_t insns_t;
extern const uint32_t sse_leaf_ee_offsets[8];
extern const uint32_t sse_leaf_oo_offsets[8];
extern const uint32_t sse_leaf_eo_offsets[8];
extern const uint32_t sse_leaf_oe_offsets[8];
#define EAX 0
#define ECX 1
#define EDX 2
#define EBX 3
#define ESI 6
#define EDI 7
#define EBP 5
#define RAX 0
#define RCX 1
#define RDX 2
#define RBX 3
#define RSI 6
#define RDI 7
#define RBP 5
#define R8 8
#define R9 9
#define R10 10
#define R11 11
#define R12 12
#define R13 13
#define R14 14
#define R15 15
void IMM8(uint8_t **p, int32_t imm) {
*(*p)++ = (imm & 0xff);
}
void IMM16(uint8_t **p, int32_t imm) {
int i;
for(i=0;i<2;i++) {
*(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
}
}
void IMM32(uint8_t **p, int32_t imm) {
int i;
for(i=0;i<4;i++) {
*(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
}
}
void IMM32_NI(uint8_t *p, int32_t imm) {
int i;
for(i=0;i<4;i++) {
*(p+i) = (imm & (0xff << (i*8))) >> (i*8);
}
}
int32_t READ_IMM32(uint8_t *p) {
int32_t rval = 0;
int i;
for(i=0;i<4;i++) {
rval |= *(p+i) << (i*8);
}
return rval;
}
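
A round-trip check of the little-endian immediate helpers (my own sketch; the load accumulates in unsigned to keep the top-byte shift well-defined):

#include <assert.h>
#include <stdint.h>

static void IMM32(uint8_t **p, int32_t imm) {
    int i;
    for (i = 0; i < 4; i++)
        *(*p)++ = (imm >> (i * 8)) & 0xff;   /* least significant byte first */
}

static int32_t READ_IMM32(const uint8_t *p) {
    uint32_t rval = 0;
    int i;
    for (i = 0; i < 4; i++)
        rval |= (uint32_t)p[i] << (i * 8);
    return (int32_t)rval;
}

int main(void) {
    uint8_t buf[4], *p = buf;
    IMM32(&p, -123456);
    assert(READ_IMM32(buf) == -123456);
    return 0;
}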
void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
// if(imm < 65536) *(*p)++ = 0x66;
if(dst >= 8) *(*p)++ = 0x41;
//if(imm < 65536 && imm >= 256) *(*p)++ = 0x66;
//if(imm >= 256)
*(*p)++ = 0xb8 | (dst & 0x7);
// else *(*p)++ = 0xb0 | (dst & 0x7);
// if(imm < 256) IMM8(p, imm);
// else
//if(imm < 65536) IMM16(p, imm);
//else
IMM32(p, imm);
//if(dst < 8) {
// *(*p)++ = 0xb8 + dst;
//}else{
// *(*p)++ = 0x49;
// *(*p)++ = 0xc7;
// *(*p)++ = 0xc0 | (dst - 8);
//}
//IMM32(p, imm);
}
void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
if(disp == 0) {
*(*p)++ = (rm & 7) | ((reg & 7) << 3);
}else if(disp <= 127 && disp >= -128) {
*(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3);
IMM8(p, disp);
}else{
*(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3);
IMM32(p, disp);
}
}
void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) {
*(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1);
*(*p)++ = 0x8d;
ADDRMODE(p, dst, base, disp);
}
void RET(uint8_t **p) {
*(*p)++ = 0xc3;
}
void ADDI(uint8_t **p, uint8_t dst, int32_t imm) {
if(dst >= 8) *(*p)++ = 0x49;
else *(*p)++ = 0x48;
if(imm > 127 || imm <= -128) *(*p)++ = 0x81;
else *(*p)++ = 0x83;
*(*p)++ = 0xc0 | (dst & 0x7);
if(imm > 127 || imm <= -128) IMM32(p, imm);
else IMM8(p, imm);
}
void CALL(uint8_t **p, uint8_t *func) {
*(*p)++ = 0xe8;
IMM32(p, ((void *)func) - (void *)(*p) - 4);
}
void PUSH(uint8_t **p, uint8_t reg) {
if(reg >= 8) *(*p)++ = 0x41;
*(*p)++ = 0x50 | (reg & 7);
}
void POP(uint8_t **p, uint8_t reg) {
if(reg >= 8) *(*p)++ = 0x41;
*(*p)++ = 0x58 | (reg & 7);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
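
These emitters produce standard x86-64 encodings; for instance, pushing one of r8 through r15 needs a REX.B prefix, which is why PUSH() and POP() emit 0x41 first. A quick self-contained check (my own sketch, duplicating the two relevant emitters):

#include <assert.h>
#include <stdint.h>

#define R12 12

static void PUSH(uint8_t **p, uint8_t reg) {
    if (reg >= 8) *(*p)++ = 0x41;   /* REX.B prefix for r8-r15 */
    *(*p)++ = 0x50 | (reg & 7);
}

static void RET(uint8_t **p) {
    *(*p)++ = 0xc3;
}

int main(void) {
    uint8_t buf[4], *p = buf;
    PUSH(&p, R12);   /* expect 0x41 0x54 */
    RET(&p);         /* expect 0xc3 */
    assert(buf[0] == 0x41 && buf[1] == 0x54 && buf[2] == 0xc3);
    return 0;
}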

3rdparty/ffts/ffts-master/src/ffts.c vendored Normal file

@@ -0,0 +1,416 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts.h"
#include "macros.h"
//#include "mini_macros.h"
#include "patterns.h"
#include "ffts_small.h"
#ifdef DYNAMIC_DISABLED
#include "ffts_static.h"
#else
#include "codegen.h"
#endif
#include <errno.h>
#include <sys/mman.h>
#include <string.h>
#include <limits.h> /* for PAGESIZE */
#if __APPLE__
#include <libkern/OSCacheControl.h>
#else
#endif
void ffts_execute(ffts_plan_t *p, const void * in, void * out) {
//TODO: Define NEEDS_ALIGNED properly instead
#if defined(HAVE_SSE) || defined(HAVE_NEON)
if(((size_t)in % 16) != 0) {
LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n");
}
if(((size_t)out % 16) != 0) {
LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n");
}
#endif
p->transform(p, (const float *)in, (float *)out);
}
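
ffts_execute() is the hot path of the public API. A minimal caller looks like the sketch below (my own example; buffers are interleaved re/im floats and must be 16-byte aligned, as the checks above enforce on SSE/NEON builds):

#include "ffts.h"
#include <stdio.h>

int main(void) {
    /* 16-point forward complex FFT of a unit impulse at index 1. */
    float in[32]  __attribute__((aligned(32)));
    float out[32] __attribute__((aligned(32)));
    int i;
    for (i = 0; i < 16; i++) {
        in[2*i]     = (i == 1) ? 1.0f : 0.0f;  /* real part */
        in[2*i + 1] = 0.0f;                    /* imaginary part */
    }
    ffts_plan_t *p = ffts_init_1d(16, -1);     /* -1 = forward, as below */
    if (!p) return 1;
    ffts_execute(p, in, out);
    for (i = 0; i < 16; i++)
        printf("%f %+fi\n", out[2*i], out[2*i + 1]);
    ffts_free(p);
    return 0;
}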
void ffts_free(ffts_plan_t *p) {
p->destroy(p);
}
void ffts_free_1d(ffts_plan_t *p) {
size_t i;
if(p->ws) {
FFTS_FREE(p->ws);
}
if(p->is) free(p->is);
if(p->ws_is) free(p->ws_is);
if(p->offsets) free(p->offsets);
//free(p->transforms);
if(p->transforms) free(p->transforms);
if(p->transform_base) {
if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
perror("Couldn't mprotect");
exit(errno);
}
munmap(p->transform_base, p->transform_size);
//free(p->transform_base);
}
free(p);
}
ffts_plan_t *ffts_init_1d(size_t N, int sign) {
if(N == 0 || (N & (N - 1)) != 0){
LOG("FFT size must be a power of two\n");
return NULL;
}
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
size_t leafN = 8;
size_t i;
#ifdef __arm__
//#ifdef HAVE_NEON
V MULI_SIGN;
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
//#endif
#else
V MULI_SIGN;
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
#endif
p->transform = NULL;
p->transform_base = NULL;
p->transforms = NULL;
p->is = NULL;
p->ws_is = NULL;
p->ws = NULL;
p->offsets = NULL;
p->destroy = ffts_free_1d;
if(N >= 32) {
ffts_init_offsets(p, N, leafN);
ffts_init_is(p, N, leafN, 1);
p->i0 = N/leafN/3+1;
p->i1 = N/leafN/3;
if((N/leafN) % 3 > 1) p->i1++;
p->i2 = N/leafN/3;
#if !defined(__arm__) || defined(HAVE_NEON)
p->i0 /= 2;
p->i1 /= 2;
#endif
}else{
p->transforms = malloc(2 * sizeof(transform_index_t));
p->transforms[0] = 0;
p->transforms[1] = 1;
if(N == 2) p->transform = &firstpass_2;
else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
p->is = NULL;
p->offsets = NULL;
}
int hardcoded = 0;
/* LUTS */
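/* One twiddle LUT per recursion level: log2(N/leafN) levels in the
general case, log2(N/4) for the hardcoded sizes below 32. */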
size_t n_luts = __builtin_ctzl(N/leafN);
if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
if(n_luts >= 32) n_luts = 0;
// fprintf(stderr, "n_luts = %zu\n", n_luts);
cdata_t *w;
int n = leafN*2;
if(hardcoded) n = 8;
size_t lut_size = 0;
for(i=0;i<n_luts;i++) {
if(!i || hardcoded) {
#ifdef __arm__
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
lut_size += n/4 * 2 * sizeof(cdata_t);
#endif
n *= 2;
} else {
#ifdef __arm__
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
#endif
}
n *= 2;
}
// lut_size *= 16;
// fprintf(stderr, "lut size = %zu\n", lut_size);
if(n_luts) {
p->ws = FFTS_MALLOC(lut_size,32);
p->ws_is = malloc(n_luts * sizeof(size_t));
}else{
p->ws = NULL;
p->ws_is = NULL;
}
w = p->ws;
n = leafN*2;
if(hardcoded) n = 8;
#ifdef HAVE_NEON
V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
#endif
for(i=0;i<n_luts;i++) {
p->ws_is[i] = w - (cdata_t *)p->ws;
//fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
if(!i || hardcoded) {
cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
size_t j;
for(j=0;j<n/4;j++) {
w0[j][0] = W_re(n,j);
w0[j][1] = W_im(n,j);
}
float *fw0 = (float *)w0;
#ifdef __arm__
if(N < 32) {
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
V temp0, temp1, temp2;
for(j=0;j<n/4;j+=2) {
// #ifdef HAVE_NEON
temp0 = VLD(fw0 + j*2);
V re, im;
re = VDUPRE(temp0);
im = VDUPIM(temp0);
#ifdef HAVE_NEON
im = VXOR(im, MULI_SIGN);
//im = IMULI(sign>0, im);
#else
im = MULI(sign>0, im);
#endif
VST(fw + j*4 , re);
VST(fw + j*4+4, im);
// #endif
}
w += n/4 * 2;
}else{
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
float *fw = (float *)w;
#ifdef HAVE_NEON
VS temp0, temp1, temp2;
for(j=0;j<n/4;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2, temp0);
}
#else
for(j=0;j<n/4;j+=1) {
fw[j*2] = fw0[j*2];
fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
}
#endif
w += n/4;
}
#else
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
V temp0, temp1, temp2;
for(j=0;j<n/4;j+=2) {
temp0 = VLD(fw0 + j*2);
V re, im;
re = VDUPRE(temp0);
im = VDUPIM(temp0);
im = VXOR(im, MULI_SIGN);
VST(fw + j*4 , re);
VST(fw + j*4+4, im);
}
w += n/4 * 2;
#endif
FFTS_FREE(w0);
}else{
cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
size_t j;
for(j=0;j<n/8;j++) {
w0[j][0] = W_re(n,j*2);
w0[j][1] = W_im(n,j*2);
w1[j][0] = W_re(n,j);
w1[j][1] = W_im(n,j);
w2[j][0] = W_re(n,j + (n/8));
w2[j][1] = W_im(n,j + (n/8));
}
float *fw0 = (float *)w0;
float *fw1 = (float *)w1;
float *fw2 = (float *)w2;
#ifdef __arm__
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
float *fw = (float *)w;
#ifdef HAVE_NEON
VS temp0, temp1, temp2;
for(j=0;j<n/8;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2*3, temp0);
temp1 = VLD2(fw1 + j*2);
temp1.val[1] = VXOR(temp1.val[1], neg);
STORESPR(fw + j*2*3 + 8, temp1);
temp2 = VLD2(fw2 + j*2);
temp2.val[1] = VXOR(temp2.val[1], neg);
STORESPR(fw + j*2*3 + 16, temp2);
}
#else
for(j=0;j<n/8;j+=1) {
fw[j*6] = fw0[j*2];
fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
fw[j*6+2] = fw1[j*2+0];
fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
fw[j*6+4] = fw2[j*2+0];
fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
}
#endif
w += n/8 * 3;
#else
//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
V temp0, temp1, temp2, re, im;
for(j=0;j<n/8;j+=2) {
temp0 = VLD(fw0 + j*2);
re = VDUPRE(temp0);
im = VDUPIM(temp0);
im = VXOR(im, MULI_SIGN);
VST(fw + j*2*6 , re);
VST(fw + j*2*6+4, im);
temp1 = VLD(fw1 + j*2);
re = VDUPRE(temp1);
im = VDUPIM(temp1);
im = VXOR(im, MULI_SIGN);
VST(fw + j*2*6+8 , re);
VST(fw + j*2*6+12, im);
temp2 = VLD(fw2 + j*2);
re = VDUPRE(temp2);
im = VDUPIM(temp2);
im = VXOR(im, MULI_SIGN);
VST(fw + j*2*6+16, re);
VST(fw + j*2*6+20, im);
}
w += n/8 * 3 * 2;
#endif
FFTS_FREE(w0);
FFTS_FREE(w1);
FFTS_FREE(w2);
}
///p->ws[i] = w;
n *= 2;
}
float *tmp = (float *)p->ws;
if(sign < 0) {
p->oe_ws = (void *)(&w_data[4]);
p->ee_ws = (void *)(w_data);
p->eo_ws = (void *)(&w_data[4]);
}else{
p->oe_ws = (void *)(w_data + 12);
p->ee_ws = (void *)(w_data + 8);
p->eo_ws = (void *)(w_data + 12);
}
p->N = N;
p->lastlut = w;
p->n_luts = n_luts;
#ifdef DYNAMIC_DISABLED
if(sign < 0) {
if(N >= 32) p->transform = ffts_static_transform_f;
}else{
if(N >= 32) p->transform = ffts_static_transform_i;
}
#else
if(N>=32) ffts_generate_func_code(p, N, leafN, sign);
#endif
return p;
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

186
3rdparty/ffts/ffts-master/src/ffts.h vendored Normal file
View File

@@ -0,0 +1,186 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CP_SSE_H__
#define __CP_SSE_H__
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
//#include <stdalign.h>
//#include "codegen.h"
#include "types.h"
#ifdef __ANDROID__
#include <android/log.h>
#define LOG(s) __android_log_print(ANDROID_LOG_ERROR, "FFTS", s)
#else
#define LOG(s) fprintf(stderr, s)
#endif
#define PI 3.1415926535897932384626433832795028841971693993751058209
static const __attribute__ ((aligned(64))) float w_data[16] = {
0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
-0.70710678118654757273731092936941, -0.70710678118654746171500846685376,
1.0f, 0.70710678118654757273731092936941f,
-0.0f, -0.70710678118654746171500846685376,
0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
1.0f, 0.70710678118654757273731092936941f,
0.0f, 0.70710678118654746171500846685376
};
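/* A reading of this table (not documented upstream): the first eight
floats feed the forward size-8 leaf passes via the ee_ws/oe_ws/eo_ws
pointers set in ffts_init_1d, the last eight feed the inverse ones;
0.7071... is cos(pi/4) = 1/sqrt(2). */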
__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); }
__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); }
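/* Twiddle factors: W_re/W_im are the real and imaginary parts of the
N-th root of unity W_N^k = exp(-2*pi*i*k/N). */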
typedef size_t transform_index_t;
//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
typedef struct _ffts_plan_t ffts_plan_t;
/**
* Contains all the information needed to perform the FFT
*
*
* DO NOT CHANGE THE ORDER OF MEMBERS
* ASSEMBLY CODE USES HARD CODED OFFSETS TO REFERENCE
* SOME OF THESE VARIABLES!!
*/
struct _ffts_plan_t {
/**
*
*/
ptrdiff_t *offsets;
#ifdef DYNAMIC_DISABLED
/**
* Twiddle factors
*/
void *ws;
/**
* ee - 2 size x size8
* oo - 2 x size4 in parallel
* oe -
*/
void *oe_ws, *eo_ws, *ee_ws;
#else
void __attribute__((aligned(32))) *ws;
void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
#endif
/**
* Pointer into an array of precomputed indexes for the input data array
*/
ptrdiff_t *is;
/**
* Twiddle Factor Indexes
*/
size_t *ws_is;
/**
* Size of the loops for the base cases
*/
size_t i0, i1, n_luts;
/**
* Size of the transform
*/
size_t N;
void *lastlut;
/**
* Used by the multidimensional code (?)
*/
transform_index_t *transforms;
//transform_func_t transform;
/**
* Pointer to the dynamically generated function
* that will execute the FFT
*/
void (*transform)(ffts_plan_t * , const void * , void * );
/**
* Pointer to the base memory address
* of the transform function
*/
void *transform_base;
/**
* Size of the memory block containing the
* generated code
*/
size_t transform_size;
/**
* Points to the constant variables used by
* the assembly code
*/
void *constants;
// multi-dimensional stuff:
struct _ffts_plan_t **plans;
int rank;
size_t *Ns, *Ms;
void *buf;
void *transpose_buf;
/**
* Pointer to the destroy function
* to clean up the plan after use
* (differs for real and multidimensional transforms)
*/
void (*destroy)(ffts_plan_t *);
/**
* Coefficients for the real-valued transforms
*/
float *A, *B;
size_t i2;
};
void ffts_free(ffts_plan_t *);
ffts_plan_t *ffts_init_1d(size_t N, int sign);
void ffts_execute(ffts_plan_t *, const void *, void *);
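/*
* Minimal usage sketch (illustrative only; error handling and aligned
* allocation are elided -- in and out must be 16-byte aligned on
* SSE/NEON builds and each hold N complex floats):
*
*   ffts_plan_t *p = ffts_init_1d(64, -1);   forward transform, N = 64
*   if (p) {
*       ffts_execute(p, in, out);
*       ffts_free(p);
*   }
*/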
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

314
3rdparty/ffts/ffts-master/src/ffts_nd.c vendored Normal file
View File

@@ -0,0 +1,314 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_nd.h"
#ifdef HAVE_NEON
#include "neon.h"
#endif
void ffts_free_nd(ffts_plan_t *p) {
int i;
for(i=0;i<p->rank;i++) {
ffts_plan_t *x = p->plans[i];
int k;
for(k=0;k<i;k++) {
if(p->Ms[i] == p->Ms[k]) x = NULL;
}
if(x) ffts_free(x);
}
free(p->Ns);
free(p->Ms);
free(p->plans);
free(p->buf);
free(p->transpose_buf);
free(p);
}
#define TSIZE 8
#include <string.h>
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
#ifdef HAVE_NEON
size_t i,j,k;
int linebytes = w*8;
for(j=0;j<h;j+=8) {
for(i=0;i<w;i+=8) {
neon_transpose_to_buf(in + j*w + i, buf, w);
uint64_t *p = out + i*h + j;
uint64_t *pbuf = buf;
uint64_t *ptemp;
#if defined(__aarch64__) || defined(__arm64__)
// This particular function comes out nicely using arm64 intrinsics; no need to deal with inline asm
{
uint64x2_t q8,q9,q10,q11,q12,q13,q14,q15;
int x;
for (x=0; x<4; x++)
{
ptemp = p;
p += w;
q8 = vld1q_u64(&pbuf[0]);
q9 = vld1q_u64(&pbuf[2]);
q10 = vld1q_u64(&pbuf[4]);
q11 = vld1q_u64(&pbuf[6]);
q12 = vld1q_u64(&pbuf[8]);
q13 = vld1q_u64(&pbuf[10]);
q14 = vld1q_u64(&pbuf[12]);
q15 = vld1q_u64(&pbuf[14]);
pbuf += 16;
vst1q_u64(&ptemp[0], q8);
vst1q_u64(&ptemp[2], q9);
vst1q_u64(&ptemp[4], q10);
vst1q_u64(&ptemp[6], q11);
ptemp = p;
p += w;
vst1q_u64(&ptemp[0], q12);
vst1q_u64(&ptemp[2], q13);
vst1q_u64(&ptemp[4], q14);
vst1q_u64(&ptemp[6], q15);
} // for x
} // aarch64
#else
__asm__ __volatile__(
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
: [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
: [w] "r" (w)
: "memory", "q8", "q9", "q10", "q11"
);
#endif // 32 vs 64-bit version
// out[i*h + j] = in[j*w + i];
}
}
#else
#ifdef HAVE_SSE
uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
int tx, ty;
int x, y;
int tw = w / TSIZE;
int th = h / TSIZE;
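/* 8x8 blocked transpose: each tile is transposed into the aligned tmp
buffer two source rows at a time with _mm_shuffle_pd, then written out
as contiguous rows so the stores to the destination stay sequential. */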
for (ty=0;ty<th;ty++) {
for (tx=0;tx<tw;tx++) {
uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE;
// Copy/transpose to tmp
for (y=0;y<TSIZE;y+=2) {
//for (x=0;x<TSIZE;x+=2) {
//op[x*TSIZE] = ip[x];
__m128d q0 = _mm_load_pd((double *)(ip0 + 0*w));
__m128d q1 = _mm_load_pd((double *)(ip0 + 1*w));
__m128d q2 = _mm_load_pd((double *)(ip0 + 2*w));
__m128d q3 = _mm_load_pd((double *)(ip0 + 3*w));
__m128d q4 = _mm_load_pd((double *)(ip0 + 4*w));
__m128d q5 = _mm_load_pd((double *)(ip0 + 5*w));
__m128d q6 = _mm_load_pd((double *)(ip0 + 6*w));
__m128d q7 = _mm_load_pd((double *)(ip0 + 7*w));
ip0 += 2;
__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
__m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
__m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
__m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
__m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
__m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
__m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
//_mm_store_pd((double *)(op0 + y*h + x), t0);
//_mm_store_pd((double *)(op0 + y*h + x + h), t1);
_mm_store_pd((double *)(op0 + 0), t0);
_mm_store_pd((double *)(op0 + 0 + TSIZE), t1);
_mm_store_pd((double *)(op0 + 2 ), t2);
_mm_store_pd((double *)(op0 + 2 + TSIZE), t3);
_mm_store_pd((double *)(op0 + 4 ), t4);
_mm_store_pd((double *)(op0 + 4 + TSIZE), t5);
_mm_store_pd((double *)(op0 + 6 ), t6);
_mm_store_pd((double *)(op0 + 6 + TSIZE), t7);
//}
op0 += 2*TSIZE;
}
op0 = out + h*tx*TSIZE + ty*TSIZE;
ip0 = tmp;
for (y=0;y<TSIZE;y+=1) {
// memcpy(op0, ip0, TSIZE * sizeof(*ip0));
__m128d q0 = _mm_load_pd((double *)(ip0 + 0));
__m128d q1 = _mm_load_pd((double *)(ip0 + 2));
__m128d q2 = _mm_load_pd((double *)(ip0 + 4));
__m128d q3 = _mm_load_pd((double *)(ip0 + 6));
_mm_store_pd((double *)(op0 + 0), q0);
_mm_store_pd((double *)(op0 + 2), q1);
_mm_store_pd((double *)(op0 + 4), q2);
_mm_store_pd((double *)(op0 + 6), q3);
op0 += h;
ip0 += TSIZE;
}
}
}
/*
size_t i,j;
for(i=0;i<w;i+=2) {
for(j=0;j<h;j+=2) {
// out[i*h + j] = in[j*w + i];
__m128d q0 = _mm_load_pd((double *)(in + j*w + i));
__m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
_mm_store_pd((double *)(out + i*h + j), t0);
_mm_store_pd((double *)(out + i*h + j + h), t1);
}
}
*/
#endif
#endif
}
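/* The N-D transform is computed as repeated 1-D passes: run the 1-D
plans along the rows of the current axis, then transpose so that the
next axis becomes contiguous in memory. */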
void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out) {
uint64_t *din = (uint64_t *)in;
uint64_t *buf = p->buf;
uint64_t *dout = (uint64_t *)out;
size_t i,j;
for(i=0;i<p->Ns[0];i++) {
p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0]));
}
ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);
for(i=1;i<p->rank;i++) {
for(j=0;j<p->Ns[i];j++) {
p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
}
ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
}
}
ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) {
size_t vol = 1;
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
p->transform = &ffts_execute_nd;
p->destroy = &ffts_free_nd;
p->rank = rank;
p->Ns = malloc(sizeof(size_t) * rank);
p->Ms = malloc(sizeof(size_t) * rank);
p->plans = malloc(sizeof(ffts_plan_t **) * rank);
int i;
for(i=0;i<rank;i++) {
p->Ns[i] = Ns[i];
vol *= Ns[i];
}
p->buf = valloc(sizeof(float) * 2 * vol);
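/* Reuse an existing 1-D plan whenever another axis needs the same
transform length; ffts_free_nd performs the matching test so each
shared plan is freed exactly once. */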
for(i=0;i<rank;i++) {
p->Ms[i] = vol / p->Ns[i];
p->plans[i] = NULL;
int k;
for(k=0;k<i;k++) {
if(p->Ms[k] == p->Ms[i])
p->plans[i] = p->plans[k];
}
if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
}
p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
return p;
}
ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) {
size_t Ns[2];
Ns[0] = N1;
Ns[1] = N2;
return ffts_init_nd(2, Ns, sign);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

59
3rdparty/ffts/ffts-master/src/ffts_nd.h vendored Normal file
View File

@@ -0,0 +1,59 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_ND_H__
#define __FFTS_ND_H__
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "ffts.h"
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
void ffts_free_nd(ffts_plan_t *p);
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf);
void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out);
ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

227
3rdparty/ffts/ffts-master/src/ffts_real.c vendored Normal file
View File

@@ -0,0 +1,227 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real.h"
void ffts_free_1d_real(ffts_plan_t *p) {
ffts_free(p->plans[0]);
free(p->A);
free(p->B);
free(p->plans);
free(p->buf);
free(p);
}
void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) {
float *out = (float *)vout;
float *buf = (float *)p->buf;
float *A = p->A;
float *B = p->B;
p->plans[0]->transform(p->plans[0], vin, buf);
size_t N = p->N;
buf[N] = buf[0];
buf[N+1] = buf[1];
float *p_buf0 = buf;
float *p_buf1 = buf + N - 2;
float *p_out = out;
size_t i;
#ifdef __ARM_NEON__
for(i=0;i<N/2;i+=2) {
__asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
"vld1.32 {q9}, [%[pb], :128]!\n\t"
"vld1.32 {q10}, [%[buf0], :128]!\n\t"
"vld1.32 {q11}, [%[buf1], :64]\n\t"
"sub %[buf1], %[buf1], #16\n\t"
"vdup.32 d26, d16[1]\n\t"
"vdup.32 d27, d17[1]\n\t"
"vdup.32 d24, d16[0]\n\t"
"vdup.32 d25, d17[0]\n\t"
"vdup.32 d30, d23[1]\n\t"
"vdup.32 d31, d22[1]\n\t"
"vdup.32 d28, d23[0]\n\t"
"vdup.32 d29, d22[0]\n\t"
"vmul.f32 q13, q13, q10\n\t"
"vmul.f32 q15, q15, q9\n\t"
"vmul.f32 q12, q12, q10\n\t"
"vmul.f32 q14, q14, q9\n\t"
"vrev64.f32 q13, q13\n\t"
"vrev64.f32 q15, q15\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d30, d31\n\t"
"vneg.f32 d26, d26\n\t"
"vneg.f32 d31, d31\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d30, d31\n\t"
"vadd.f32 q12, q12, q14\n\t"
"vadd.f32 q13, q13, q15\n\t"
"vadd.f32 q12, q12, q13\n\t"
"vst1.32 {q12}, [%[pout], :128]!\n\t"
: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
[pout] "+r" (p_out)
:
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
#else
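/* Scalar path: combine bin k of the half-size complex FFT with the
reflected bin N/2-k using the A/B twiddles built in ffts_init_1d_real,
recovering the spectrum of the length-N real input. */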
for(i=0;i<N/2;i++) {
out[2*i] = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1];
out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i];
// out[2*N-2*i] = out[2*i];
// out[2*N-2*i+1] = -out[2*i+1];
#endif
}
out[N] = buf[0] - buf[1];
out[N+1] = 0.0f;
}
void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
float *out = (float *)vout;
float *in = (float *)vin;
float *buf = (float *)p->buf;
float *A = p->A;
float *B = p->B;
size_t N = p->N;
float *p_buf0 = in;
float *p_buf1 = in + N - 2;
float *p_out = buf;
size_t i;
#ifdef __ARM_NEON__
for(i=0;i<N/2;i+=2) {
__asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
"vld1.32 {q9}, [%[pb], :128]!\n\t"
"vld1.32 {q10}, [%[buf0], :128]!\n\t"
"vld1.32 {q11}, [%[buf1], :64]\n\t"
"sub %[buf1], %[buf1], #16\n\t"
"vdup.32 d26, d16[1]\n\t"
"vdup.32 d27, d17[1]\n\t"
"vdup.32 d24, d16[0]\n\t"
"vdup.32 d25, d17[0]\n\t"
"vdup.32 d30, d23[1]\n\t"
"vdup.32 d31, d22[1]\n\t"
"vdup.32 d28, d23[0]\n\t"
"vdup.32 d29, d22[0]\n\t"
"vmul.f32 q13, q13, q10\n\t"
"vmul.f32 q15, q15, q9\n\t"
"vmul.f32 q12, q12, q10\n\t"
"vmul.f32 q14, q14, q9\n\t"
"vrev64.f32 q13, q13\n\t"
"vrev64.f32 q15, q15\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d28, d29\n\t"
"vneg.f32 d27, d27\n\t"
"vneg.f32 d29, d29\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d28, d29\n\t"
"vadd.f32 q12, q12, q14\n\t"
"vsub.f32 q13, q13, q15\n\t"
"vadd.f32 q12, q12, q13\n\t"
"vst1.32 {q12}, [%[pout], :128]!\n\t"
: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
[pout] "+r" (p_out)
:
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
#else
for(i=0;i<N/2;i++) {
buf[2*i] = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1];
buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i];
#endif
}
p->plans[0]->transform(p->plans[0], buf, out);
}
ffts_plan_t *ffts_init_1d_real(size_t N, int sign) {
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
if(sign < 0) p->transform = &ffts_execute_1d_real;
else p->transform = &ffts_execute_1d_real_inv;
p->destroy = &ffts_free_1d_real;
p->N = N;
p->rank = 1;
p->plans = malloc(sizeof(ffts_plan_t **) * 1);
p->plans[0] = ffts_init_1d(N/2, sign);
p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1));
p->A = valloc(sizeof(float) * N);
p->B = valloc(sizeof(float) * N);
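/* Assumed derivation (the standard length-N real FFT via a length-N/2
complex FFT; not documented upstream): with theta = 2*pi*k/N,
A_k = (1 - i*e^{-i*theta})/2 = ((1 - sin theta)/2, -(cos theta)/2)
B_k = (1 + i*e^{-i*theta})/2 = ((1 + sin theta)/2,  (cos theta)/2)
stored interleaved as (re, im) pairs; the inverse tables below drop
the factor of 1/2. */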
if(sign < 0) {
int i;
for (i = 0; i < N/2; i++) {
p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
}
}else{
int i;
for (i = 0; i < N/2; i++) {
p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
}
}
return p;
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

54
3rdparty/ffts/ffts-master/src/ffts_real.h vendored Normal file
View File

@@ -0,0 +1,54 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_REAL_H__
#define __FFTS_REAL_H__
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "ffts.h"
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

197
3rdparty/ffts/ffts-master/src/ffts_real_nd.c vendored Normal file
View File

@@ -0,0 +1,197 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real_nd.h"
#ifdef __ARM_NEON__
#include "neon.h"
#endif
void ffts_free_nd_real(ffts_plan_t *p) {
int i;
for(i=0;i<p->rank;i++) {
ffts_plan_t *x = p->plans[i];
int k;
for(k=i+1;k<p->rank;k++) {
if(x == p->plans[k]) p->plans[k] = NULL;
}
if(x) ffts_free(x);
}
free(p->Ns);
free(p->Ms);
free(p->plans);
free(p->buf);
free(p->transpose_buf);
free(p);
}
void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
size_t i,j;
for(i=0;i<w;i+=1) {
for(j=0;j<h;j+=1) {
out[i*h + j] = in[j*w + i];
}
}
}
void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) {
uint32_t *din = (uint32_t *)in;
uint64_t *buf = p->buf;
uint64_t *dout = (uint64_t *)out;
size_t i,j;
for(i=0;i<p->Ns[0];i++) {
p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));
}
ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);
for(i=1;i<p->rank;i++) {
for(j=0;j<p->Ns[i];j++) {
p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
}
ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
}
}
void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) {
uint64_t *din = (uint64_t *)in;
uint64_t *buf = p->buf;
uint64_t *buf2;
uint64_t *dout = (uint64_t *)out;
size_t vol = 1;
float *bufr = (float *)(p->buf);
float *doutr = (float *)out;
size_t i,j;
for(i=0;i<p->rank;i++) {
vol *= p->Ns[i];
}
buf2 = buf + vol;
ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);
for(i=0;i<p->Ms[0];i++) {
p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0]));
}
ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf);
for(j=0;j<p->Ms[1];j++) {
p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);
}
}
ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
size_t vol = 1;
size_t bufsize;
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
if(sign < 0) p->transform = &ffts_execute_nd_real;
else p->transform = &ffts_execute_nd_real_inv;
p->destroy = &ffts_free_nd_real;
p->rank = rank;
p->Ns = malloc(sizeof(size_t) * rank);
p->Ms = malloc(sizeof(size_t) * rank);
p->plans = malloc(sizeof(ffts_plan_t **) * rank);
int i;
for(i=0;i<rank;i++) {
p->Ns[i] = Ns[i];
vol *= Ns[i];
}
//There is probably a prettier way of doing this, but it works..
if(sign < 0) {
bufsize = 2 * vol;
}
else {
bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
}
p->buf = valloc(sizeof(float) * bufsize);
for(i=0;i<rank;i++) {
p->Ms[i] = vol / p->Ns[i];
p->plans[i] = NULL;
int k;
if(sign < 0) {
for(k=1;k<i;k++) {
if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
}
if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
}else{
for(k=0;k<i;k++) {
if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
}
if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign);
}
}
if(sign < 0) {
for(i=1;i<rank;i++) {
p->Ns[i] = p->Ns[i] / 2 + 1;
}
}else{
for(i=0;i<rank-1;i++) {
p->Ms[i] = p->Ms[i] / 2 + 1;
}
}
p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
return p;
}
ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
size_t Ns[2];
Ns[0] = N1;
Ns[1] = N2;
return ffts_init_nd_real(2, Ns, sign);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

54
3rdparty/ffts/ffts-master/src/ffts_real_nd.h vendored Normal file
View File

@@ -0,0 +1,54 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_REAL_ND_H__
#define __FFTS_REAL_ND_H__
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "ffts_nd.h"
#include "ffts_real.h"
#include "ffts.h"
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

157
3rdparty/ffts/ffts-master/src/ffts_small.c vendored Normal file
View File

@@ -0,0 +1,157 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts.h"
#include "macros.h"
#include <stdlib.h>
#define DEBUG(x)
#include "ffts_small.h"
void firstpass_16_f(ffts_plan_t * p, const void * in, void * out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
float *LUT8 = p->ws;
L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
}
void firstpass_16_b(ffts_plan_t * p, const void * in, void * out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
float *LUT8 = p->ws;
L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
}
void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1, r2_3, r4_5, r6_7;
float *LUT8 = p->ws + p->ws_is[0];
L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
}
void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1, r2_3, r4_5, r6_7;
float *LUT8 = p->ws + p->ws_is[0];
L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
}
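/* Scalar 4-point base cases: two 2-point butterflies (t4..t7) combined
with a +/-i rotation of t7; the forward (_f) and backward (_b)
variants differ only in the sign of that rotation. */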
void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
t0[0] = din[0]; t0[1] = din[1];
t1[0] = din[4]; t1[1] = din[5];
t2[0] = din[2]; t2[1] = din[3];
t3[0] = din[6]; t3[1] = din[7];
t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
}
void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
t0[0] = din[0]; t0[1] = din[1];
t1[0] = din[4]; t1[1] = din[5];
t2[0] = din[2]; t2[1] = din[3];
t3[0] = din[6]; t3[1] = din[7];
t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
}
void firstpass_2(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
cdata_t t0, t1, r0,r1;
t0[0] = din[0]; t0[1] = din[1];
t1[0] = din[2]; t1[1] = din[3];
r0[0] = t0[0] + t1[0];
r0[1] = t0[1] + t1[1];
r1[0] = t0[0] - t1[0];
r1[1] = t0[1] - t1[1];
dout[0] = r0[0]; dout[1] = r0[1];
dout[2] = r1[0]; dout[3] = r1[1];
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

14
3rdparty/ffts/ffts-master/src/ffts_small.h vendored Normal file
View File

@@ -0,0 +1,14 @@
#ifndef __FFTS_SMALL_H__
#define __FFTS_SMALL_H__
void firstpass_16_f(ffts_plan_t * p, const void * in, void * out);
void firstpass_16_b(ffts_plan_t * p, const void * in, void * out);
void firstpass_8_f(ffts_plan_t * p, const void * in, void * out);
void firstpass_8_b(ffts_plan_t * p, const void * in, void * out);
void firstpass_4_f(ffts_plan_t * p, const void * in, void * out);
void firstpass_4_b(ffts_plan_t * p, const void * in, void * out);
void firstpass_2(ffts_plan_t * p, const void * in, void * out);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

102
3rdparty/ffts/ffts-master/src/ffts_static.c vendored Normal file
View File

@@ -0,0 +1,102 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_static.h"
void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) {
if(N > 16) {
size_t N1 = N >> 1;
size_t N2 = N >> 2;
size_t N3 = N >> 3;
float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1);
ffts_static_rec_i(p, data, N2);
ffts_static_rec_i(p, data + N1, N3);
ffts_static_rec_i(p, data + N1 + N2, N3);
ffts_static_rec_i(p, data + N, N2);
ffts_static_rec_i(p, data + N + N1, N2);
if(N == p->N) {
neon_static_x8_t_i(data, N, ws);
}else{
neon_static_x8_i(data, N, ws);
}
}else if(N==16){
neon_static_x4_i(data, N, p->ws);
}
}
void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) {
if(N > 16) {
size_t N1 = N >> 1;
size_t N2 = N >> 2;
size_t N3 = N >> 3;
float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1);
ffts_static_rec_f(p, data, N2);
ffts_static_rec_f(p, data + N1, N3);
ffts_static_rec_f(p, data + N1 + N2, N3);
ffts_static_rec_f(p, data + N, N2);
ffts_static_rec_f(p, data + N + N1, N2);
if(N == p->N) {
neon_static_x8_t_f(data, N, ws);
}else{
neon_static_x8_f(data, N, ws);
}
}else if(N==16){
neon_static_x4_f(data, N, p->ws);
}
}
void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out) {
if(__builtin_ctzl(p->N) & 1)
neon_static_o_f(p, in, out);
else
neon_static_e_f(p, in, out);
ffts_static_rec_f(p, out, p->N);
}
void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out) {
if(__builtin_ctzl(p->N) & 1)
neon_static_o_i(p, in, out);
else
neon_static_e_i(p, in, out);
ffts_static_rec_i(p, out, p->N);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

47
3rdparty/ffts/ffts-master/src/ffts_static.h vendored Normal file
View File

@@ -0,0 +1,47 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_STATIC_H__
#define __FFTS_STATIC_H__
#include "ffts.h"
#include "neon.h"
void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ;
void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out);
void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ;
void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

207
3rdparty/ffts/ffts-master/src/macros-alpha.h vendored Normal file
View File

@@ -0,0 +1,207 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_ALPHA_H__
#define __MACROS_ALPHA_H__
#include <math.h>
#include <stdint.h>   /* uint32_t (used below) */
#include <string.h>   /* memcpy, for bit-pattern XOR */
#ifdef __alpha__
#define restrict
#endif
typedef struct {float r1, i1, r2, i2;} V;
#define FFTS_MALLOC(d,a) malloc(d)
#define FFTS_FREE(d) free(d)
#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3})
static inline V VADD(V x, V y)
{
V z;
z.r1 = x.r1 + y.r1;
z.i1 = x.i1 + y.i1;
z.r2 = x.r2 + y.r2;
z.i2 = x.i2 + y.i2;
return z;
}
static inline V VSUB(V x, V y)
{
V z;
z.r1 = x.r1 - y.r1;
z.i1 = x.i1 - y.i1;
z.r2 = x.r2 - y.r2;
z.i2 = x.i2 - y.i2;
return z;
}
static inline V VMUL(V x, V y)
{
V z;
z.r1 = x.r1 * y.r1;
z.i1 = x.i1 * y.i1;
z.r2 = x.r2 * y.r2;
z.i2 = x.i2 * y.i2;
return z;
}
/* XOR the float bit patterns via memcpy type-punning; the original value
casts ((uint32_t)x.r1) converted numerically, so -0.0f became 0 and the
sign-flip trick used by the twiddle setup broke. */
static inline float VXOR1(float a, float b)
{
uint32_t ua, ub;
memcpy(&ua, &a, sizeof ua); memcpy(&ub, &b, sizeof ub);
ua ^= ub; memcpy(&a, &ua, sizeof a);
return a;
}
static inline V VXOR(V x, V y)
{
V r;
r.r1 = VXOR1(x.r1, y.r1); r.i1 = VXOR1(x.i1, y.i1);
r.r2 = VXOR1(x.r2, y.r2); r.i2 = VXOR1(x.i2, y.i2);
return r;
}
static inline V VSWAPPAIRS(V x)
{
V z;
z.r1 = x.i1;
z.i1 = x.r1;
z.r2 = x.i2;
z.i2 = x.r2;
return z;
}
static inline V VBLEND(V x, V y)
{
V z;
z.r1 = x.r1;
z.i1 = x.i1;
z.r2 = y.r2;
z.i2 = y.i2;
return z;
}
static inline V VUNPACKHI(V x, V y)
{
V z;
z.r1 = x.r2;
z.i1 = x.i2;
z.r2 = y.r2;
z.i2 = y.i2;
return z;
}
static inline V VUNPACKLO(V x, V y)
{
V z;
z.r1 = x.r1;
z.i1 = x.i1;
z.r2 = y.r1;
z.i2 = y.i1;
return z;
}
static inline V VDUPRE(V x)
{
V z;
z.r1 = x.r1;
z.i1 = x.r1;
z.r2 = x.r2;
z.i2 = x.r2;
return z;
}
static inline V VDUPIM(V x)
{
V z;
z.r1 = x.i1;
z.i1 = x.i1;
z.r2 = x.i2;
z.i2 = x.i2;
return z;
}
static inline V IMUL(V d, V re, V im)
{
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VSUB(re, im);
}
static inline V IMULJ(V d, V re, V im)
{
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VADD(re, im);
}
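/* IMUL/IMULJ implement the complex multiply of the data vector d by a
twiddle whose duplicated real (re) and imaginary (im) parts come from
the LUT; the sign handling of the imaginary lane is baked into the LUT
when the plan is built (see ffts_init_1d), so no conjugate appears here. */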
static inline V MULI(int inv, V x)
{
V z;
if (inv) {
z.r1 = -x.r1;
z.i1 = x.i1;
z.r2 = -x.r2;
z.i2 = x.i2;
}else{
z.r1 = x.r1;
z.i1 = -x.i1;
z.r2 = x.r2;
z.i2 = -x.i2;
}
return z;
}
static inline V IMULI(int inv, V x)
{
return VSWAPPAIRS(MULI(inv, x));
}
static inline V VLD(const void *s)
{
V *d = (V *)s;
return *d;
}
static inline void VST(void *d, V s)
{
V *r = (V *)d;
*r = s;
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

138
3rdparty/ffts/ffts-master/src/macros-altivec.h vendored Normal file
View File

@@ -0,0 +1,138 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_ALTIVEC_H__
#define __MACROS_ALTIVEC_H__
#include <math.h>
#include <altivec.h>
#define restrict
typedef vector float V;
typedef vector unsigned char VUC;
#ifdef __APPLE__
#define FFTS_MALLOC(d,a) vec_malloc(d)
#define FFTS_FREE(d) vec_free(d)
#else
/* It appears vec_malloc() and friends are not implemented on Linux */
#include <malloc.h>
#define FFTS_MALLOC(d,a) memalign(16,d)
#define FFTS_FREE(d) free(d)
#endif
#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
#define VADD(x,y) vec_add(x,y)
#define VSUB(x,y) vec_sub(x,y)
#define VMUL(x,y) vec_madd(x,y,(V){0})
#define VMULADD(x,y,z) vec_madd(x,y,z)
#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
#define VXOR(x,y) vec_xor((x),(y))
#define VSWAPPAIRS(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
#define VBLEND(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
#define VUNPACKHI(x,y) \
vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
#define VUNPACKLO(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
#define VDUPRE(x) \
vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
#define VDUPIM(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
static inline V IMUL(V d, V re, V im)
{
im = VMUL(im, VSWAPPAIRS(d));
re = VMUL(re, d);
return VSUB(re, im);
}
static inline V IMULJ(V d, V re, V im)
{
im = VMUL(im, VSWAPPAIRS(d));
return VMULADD(re, d, im);
}
#ifndef __GNUC__
/* gcc (4.6 and 4.7) ICEs on this code! */
static inline V MULI(int inv, V x)
{
return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
}
#else
/* but compiles this fine... */
static inline V MULI(int inv, V x)
{
V t;
t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
return VXOR(x, t);
}
#endif
static inline V IMULI(int inv, V x)
{
return VSWAPPAIRS(MULI(inv, x));
}
static inline V VLD(const void *s)
{
V *d = (V *)s;
return *d;
}
static inline void VST(void *d, V s)
{
V *r = (V *)d;
*r = s;
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

97
3rdparty/ffts/ffts-master/src/macros-neon.h vendored Normal file
View File

@@ -0,0 +1,97 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_NEON_H__
#define __MACROS_NEON_H__
#include "neon.h"
#include <arm_neon.h>
typedef float32x4_t V;
typedef float32x4x2_t VS;
#define ADD vaddq_f32
#define SUB vsubq_f32
#define MUL vmulq_f32
#define VADD vaddq_f32
#define VSUB vsubq_f32
#define VMUL vmulq_f32
#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
#define VST vst1q_f32
#define VLD vld1q_f32
#define VST2 vst2q_f32
#define VLD2 vld2q_f32
#define VSWAPPAIRS(x) (vrev64q_f32(x))
#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
return VLD(d);
}
#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
#define FFTS_MALLOC(d,a) (valloc(d))
#define FFTS_FREE(d) (free(d))
__INLINE void STORESPR(data_t * addr, VS p) {
vst1q_f32(addr, p.val[0]);
vst1q_f32(addr + 4, p.val[1]);
}
__INLINE V IMULI(int inv, V a) {
if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
}
__INLINE V IMUL(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VSUB(re, im);
}
__INLINE V IMULJ(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VADD(re, im);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
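
IMULI above multiplies each complex pair by +/-i with no floating-point multiply at all: XOR against a -0.0f lane flips a float's sign bit, and VSWAPPAIRS then exchanges the real/imaginary slots. Note also that VLIT4 on this path takes its arguments high-lane-first, mirroring _mm_set_ps. A portable C sketch of the same bit trick (sign_flip and muli are illustrative names; which of +i/-i the inv flag selects is my reading of the masks):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Flip a float's sign by XORing its sign bit -- the scalar
   equivalent of VXOR against a -0.0f lane. */
static float sign_flip(float x)
{
    uint32_t u;
    memcpy(&u, &x, sizeof u);   /* well-defined type pun */
    u ^= 0x80000000u;           /* bit pattern of -0.0f  */
    memcpy(&x, &u, sizeof u);
    return x;
}

/* Multiply the pair {re, im} by +i or -i: negate one component,
   then swap the pair (the VSWAPPAIRS step). */
static void muli(int inv, float *re, float *im)
{
    float r = *re, i = *im;
    if (inv) { *re = i;            *im = sign_flip(r); }  /* (a+bi)*-i = b - ai  */
    else     { *re = sign_flip(i); *im = r;            }  /* (a+bi)*+i = -b + ai */
}

int main(void)
{
    float re = 3.0f, im = 4.0f;
    muli(0, &re, &im);
    printf("%g%+gi\n", re, im);   /* prints -4+3i */
    return 0;
}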


@@ -0,0 +1,85 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SSE_FLOAT_H__
#define __SSE_FLOAT_H__
#include <xmmintrin.h>
//#define VL 4
typedef __m128 V;
#define VADD _mm_add_ps
#define VSUB _mm_sub_ps
#define VMUL _mm_mul_ps
//#define VLIT4 _mm_set_ps
#define VXOR _mm_xor_ps
#define VST _mm_store_ps
#define VLD _mm_load_ps
#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
#define VLIT4 _mm_set_ps
#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
#define FFTS_FREE(d) (_mm_free(d))
__INLINE V IMULI(int inv, V a) {
if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
}
__INLINE V IMUL(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VSUB(re, im);
}
__INLINE V IMULJ(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VADD(re, im);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
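
VLD/VST on the SSE path map to the aligned _mm_load_ps/_mm_store_ps, so every buffer that reaches them must be 16-byte aligned; that is why FFTS_MALLOC routes through _mm_malloc rather than plain malloc. A minimal sketch of that contract (the buffer size and contents are illustrative):

#include <xmmintrin.h>
#include <stdio.h>

int main(void)
{
    /* 16-byte alignment is required by _mm_load_ps/_mm_store_ps;
       an unaligned pointer would fault at the first VLD. */
    float *buf = (float *)_mm_malloc(8 * sizeof(float), 16);
    if (!buf) return 1;

    for (int i = 0; i < 8; i++) buf[i] = (float)i;

    __m128 v = _mm_load_ps(buf);        /* VLD  */
    v = _mm_add_ps(v, v);               /* VADD */
    _mm_store_ps(buf + 4, v);           /* VST  */

    printf("%g %g\n", buf[4], buf[7]);  /* 0 6 */
    _mm_free(buf);
    return 0;
}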

3rdparty/ffts/ffts-master/src/macros.h vendored Normal file

@@ -0,0 +1,162 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_H__
#define __MACROS_H__
#ifdef HAVE_NEON
#include "macros-neon.h"
#else
#ifdef __alpha__
#include "macros-alpha.h"
#else
#ifdef __powerpc__
#include "macros-altivec.h"
#endif
#endif
#endif
#ifdef HAVE_VFP
#include "macros-alpha.h"
#endif
#ifdef HAVE_SSE
#include "macros-sse.h"
#endif
static inline void TX2(V *a, V *b)
{
V TX2_t0 = VUNPACKLO(*a, *b);
V TX2_t1 = VUNPACKHI(*a, *b);
*a = TX2_t0; *b = TX2_t1;
}
static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
{
V uk, uk2, zk_p, zk_n, zk, zk_d;
uk = *r0; uk2 = *r1;
zk_p = IMUL(*r2, re, im);
zk_n = IMULJ(*r3, re, im);
zk = VADD(zk_p, zk_n);
zk_d = IMULI(inv, VSUB(zk_p, zk_n));
*r2 = VSUB(uk, zk);
*r0 = VADD(uk, zk);
*r3 = VADD(uk2, zk_d);
*r1 = VSUB(uk2, zk_d);
}
static inline void S_4(V r0, V r1, V r2, V r3,
data_t * restrict o0, data_t * restrict o1,
data_t * restrict o2, data_t * restrict o3)
{
VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
}
static inline void L_2_4(int inv,
const data_t * restrict i0, const data_t * restrict i1,
const data_t * restrict i2, const data_t * restrict i3,
V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = VSUB(t2, t3);
*r0 = VUNPACKLO(t4, t5);
*r1 = VUNPACKLO(t6, t7);
t5 = IMULI(inv, t5);
t0 = VADD(t6, t4);
t2 = VSUB(t6, t4);
t1 = VSUB(t7, t5);
t3 = VADD(t7, t5);
*r3 = VUNPACKHI(t0, t1);
*r2 = VUNPACKHI(t2, t3);
}
static inline void L_4_4(int inv,
const data_t * restrict i0, const data_t * restrict i1,
const data_t * restrict i2, const data_t * restrict i3,
V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = IMULI(inv, VSUB(t2, t3));
t0 = VADD(t4, t6);
t2 = VSUB(t4, t6);
t1 = VSUB(t5, t7);
t3 = VADD(t5, t7);
TX2(&t0, &t1);
TX2(&t2, &t3);
*r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
}
static inline void L_4_2(int inv,
const data_t * restrict i0, const data_t * restrict i1,
const data_t * restrict i2, const data_t * restrict i3,
V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3);
t2 = VBLEND(t6, t7);
t3 = VBLEND(t7, t6);
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = VSUB(t2, t3);
*r2 = VUNPACKHI(t4, t5);
*r3 = VUNPACKHI(t6, t7);
t7 = IMULI(inv, t7);
t0 = VADD(t4, t6);
t2 = VSUB(t4, t6);
t1 = VSUB(t5, t7);
t3 = VADD(t5, t7);
*r0 = VUNPACKLO(t0, t1);
*r1 = VUNPACKLO(t2, t3);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
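
K_N above is the radix-4 butterfly shared by every backend. Reading IMUL as multiplication by the twiddle w and IMULJ as multiplication by its conjugate (per the note after macros-altivec.h), one invocation computes, as my reading of the dataflow:

\begin{aligned}
z_p &= w\,r_2, & z_n &= \bar{w}\,r_3,\\
z   &= z_p + z_n, & z_d &= \pm i\,(z_p - z_n),\\
r_0' &= r_0 + z, & r_2' &= r_0 - z,\\
r_1' &= r_1 - z_d, & r_3' &= r_1 + z_d,
\end{aligned}

with the sign of i chosen by the inv flag through IMULI.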

3rdparty/ffts/ffts-master/src/neon.h vendored Normal file

@@ -0,0 +1,66 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NEON_H__
#define __NEON_H__
#include "ffts.h"
void neon_x4(float *, size_t, float *);
void neon_x8(float *, size_t, float *);
void neon_x8_t(float *, size_t, float *);
void neon_ee();
void neon_oo();
void neon_eo();
void neon_oe();
void neon_end();
void neon_transpose(uint64_t *in, uint64_t *out, int w, int h);
void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w);
//typedef struct _ffts_plan_t ffts_plan_t;
void neon_static_e_f(ffts_plan_t * , const void * , void * );
void neon_static_o_f(ffts_plan_t * , const void * , void * );
void neon_static_x4_f(float *, size_t, float *);
void neon_static_x8_f(float *, size_t, float *);
void neon_static_x8_t_f(float *, size_t, float *);
void neon_static_e_i(ffts_plan_t * , const void * , void * );
void neon_static_o_i(ffts_plan_t * , const void * , void * );
void neon_static_x4_i(float *, size_t, float *);
void neon_static_x8_i(float *, size_t, float *);
void neon_static_x8_t_i(float *, size_t, float *);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

3rdparty/ffts/ffts-master/src/neon.s vendored Normal file

@@ -0,0 +1,738 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.align 4
#ifdef __APPLE__
.globl _neon_x4
_neon_x4:
#else
.globl neon_x4
neon_x4:
#endif
@ add r3, r0, #0
vld1.32 {q8,q9}, [r0, :128]
add r4, r0, r1, lsl #1
vld1.32 {q10,q11}, [r4, :128]
add r5, r0, r1, lsl #2
vld1.32 {q12,q13}, [r5, :128]
add r6, r4, r1, lsl #2
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q2,q3}, [r2, :128]
vmul.f32 q0, q13, q3
vmul.f32 q5, q12, q2
vmul.f32 q1, q14, q2
vmul.f32 q4, q14, q3
vmul.f32 q14, q12, q3
vmul.f32 q13, q13, q2
vmul.f32 q12, q15, q3
vmul.f32 q2, q15, q2
vsub.f32 q0, q5, q0
vadd.f32 q13, q13, q14
vadd.f32 q12, q12, q1
vsub.f32 q1, q2, q4
vadd.f32 q15, q0, q12
vsub.f32 q12, q0, q12
vadd.f32 q14, q13, q1
vsub.f32 q13, q13, q1
vadd.f32 q0, q8, q15
vadd.f32 q1, q9, q14
vsub.f32 q2, q10, q13 @
vsub.f32 q4, q8, q15
vadd.f32 q3, q11, q12 @
vst1.32 {q0,q1}, [r0, :128]
vsub.f32 q5, q9, q14
vadd.f32 q6, q10, q13 @
vsub.f32 q7, q11, q12 @
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_x8
_neon_x8:
#else
.globl neon_x8
neon_x8:
#endif
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst1.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst1.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst1.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst1.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst1.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst1.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst1.32 {q4,q5}, [r8, :128]!
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_x8_t
_neon_x8_t:
#else
.globl neon_x8_t
neon_x8_t:
#endif
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_t_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst2.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst2.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst2.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst2.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst2.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst2.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst2.32 {q4,q5}, [r8, :128]!
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
@bx lr
@ assumes r0 = out
@ r1 = in ?
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_ee
_neon_ee:
#else
.globl neon_ee
neon_ee:
#endif
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @-
vadd.f32 d7, d31, d26 @-
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @-
vsub.f32 d4, d18, d15 @-
vsub.f32 d13, d19, d14 @-
vadd.f32 d12, d18, d15 @-
vsub.f32 d15, d31, d26 @-
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @-
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop
@ assumes r0 = out
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_oo
_neon_oo:
#else
.globl neon_oo
neon_oo:
#endif
_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vsub.f32 d7, d19, d16 @
vadd.f32 d3, d19, d16 @
vadd.f32 d6, d18, d17 @
vsub.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vsub.f32 d15, d19, d16 @
ldr lr, [r12], #4
vadd.f32 d11, d19, d16 @
vadd.f32 d14, d18, d17 @
vsub.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_loop
@ assumes r0 = out
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = addr of twiddle
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_eo
_neon_eo:
#else
.globl neon_eo
neon_eo:
#endif
vld2.32 {q9}, [r5, :128]! @tag2
vld2.32 {q13}, [r3, :128]! @tag0
vld2.32 {q12}, [r4, :128]! @tag1
vld2.32 {q0}, [r7, :128]! @tag4
vsub.f32 q11, q13, q12
vld2.32 {q8}, [r6, :128]! @tag3
vadd.f32 q12, q13, q12
vsub.f32 q10, q9, q8
vadd.f32 q8, q9, q8
vadd.f32 q9, q12, q8
vadd.f32 d9, d23, d20 @
vsub.f32 d11, d23, d20 @
vsub.f32 q8, q12, q8
vsub.f32 d8, d22, d21 @
vadd.f32 d10, d22, d21 @
ldr r2, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
ldr lr, [r12], #4
vtrn.32 q9, q4
add r2, r0, r2, lsl #2
vtrn.32 q8, q5
add lr, r0, lr, lsl #2
vswp d9,d10
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
vsub.f32 q14, q15, q13
vsub.f32 q12, q0, q11
vadd.f32 q11, q0, q11
vadd.f32 q13, q15, q13
vadd.f32 d13, d29, d24 @
vadd.f32 q15, q13, q11
vsub.f32 d12, d28, d25 @
vsub.f32 d15, d29, d24 @
vadd.f32 d14, d28, d25 @
vtrn.32 q15, q6
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
vmul.f32 d28, d27, d20
vmul.f32 d25, d26, d20
vmul.f32 d26, d27, d21
vmul.f32 d27, d22, d21
vmul.f32 d30, d23, d20
vmul.f32 d29, d23, d21
vmul.f32 d22, d22, d20
vsub.f32 d21, d28, d24
vadd.f32 d20, d26, d25
vadd.f32 d25, d30, d27
vsub.f32 d24, d22, d29
vadd.f32 q11, q12, q10
vsub.f32 q10, q12, q10
vadd.f32 q0, q9, q11
vsub.f32 q2, q9, q11
vadd.f32 d3, d17, d20 @
vsub.f32 d7, d17, d20 @
vsub.f32 d2, d16, d21 @
vadd.f32 d6, d16, d21 @
vswp d1, d2
vswp d5, d6
vstmia r2!, {q0-q3}
@ assumes r0 = out
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = addr of twiddle
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_oe
_neon_oe:
#else
.globl neon_oe
neon_oe:
#endif
vld1.32 {q8}, [r5, :128]!
vld1.32 {q10}, [r6, :128]!
vld2.32 {q11}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vld2.32 {q15}, [r10, :128]!
vorr d25, d17, d17
vorr d24, d20, d20
vorr d20, d16, d16
vsub.f32 q9, q13, q11
vadd.f32 q11, q13, q11
ldr r2, [r12], #4
vtrn.32 d24, d25
ldr lr, [r12], #4
vtrn.32 d20, d21
add r2, r0, r2, lsl #2
vsub.f32 q8, q10, q12
add lr, r0, lr, lsl #2
vadd.f32 q10, q10, q12
vadd.f32 q0, q11, q10
vadd.f32 d25, d19, d16 @
vsub.f32 d27, d19, d16 @
vsub.f32 q1, q11, q10
vsub.f32 d24, d18, d17 @
vadd.f32 d26, d18, d17 @
vtrn.32 q0, q12
vtrn.32 q1, q13
vld1.32 {d24, d25}, [r11, :128]
vswp d1, d2
vst1.32 {q0, q1}, [r2, :128]!
vld2.32 {q0}, [r9, :128]!
vadd.f32 q1, q0, q15
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vsub.f32 q15, q0, q15
vsub.f32 q0, q14, q13
vadd.f32 q3, q14, q13
vadd.f32 q2, q3, q1
vadd.f32 d29, d1, d30 @
vsub.f32 d27, d1, d30 @
vsub.f32 q3, q3, q1
vsub.f32 d28, d0, d31 @
vadd.f32 d26, d0, d31 @
vtrn.32 q2, q14
vtrn.32 q3, q13
vswp d5, d6
vst1.32 {q2, q3}, [r2, :128]!
vtrn.32 q11, q9
vtrn.32 q10, q8
vmul.f32 d20, d18, d25
vmul.f32 d22, d19, d24
vmul.f32 d21, d19, d25
vmul.f32 d18, d18, d24
vmul.f32 d19, d16, d25
vmul.f32 d30, d17, d24
vmul.f32 d23, d16, d24
vmul.f32 d24, d17, d25
vadd.f32 d17, d22, d20
vsub.f32 d16, d18, d21
vsub.f32 d21, d30, d19
vadd.f32 d20, d24, d23
vadd.f32 q9, q8, q10
vsub.f32 q8, q8, q10
vadd.f32 q4, q14, q9
vsub.f32 q6, q14, q9
vadd.f32 d11, d27, d16 @
vsub.f32 d15, d27, d16 @
vsub.f32 d10, d26, d17 @
vadd.f32 d14, d26, d17 @
vswp d9, d10
vswp d13, d14
vstmia lr!, {q4-q7}
.align 4
#ifdef __APPLE__
.globl _neon_end
_neon_end:
#else
.globl neon_end
neon_end:
#endif
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_transpose
_neon_transpose:
#else
.globl neon_transpose
neon_transpose:
#endif
push {r4-r8}
@ vpush {q8-q9}
mov r5, r3
_neon_transpose_col:
mov r7, r1
add r8, r1, r3, lsl #3
mov r4, r2
add r6, r0, r2, lsl #3
_neon_transpose_row:
vld1.32 {q8,q9}, [r0, :128]!
@ vld1.32 {q10,q11}, [r0, :128]!
vld1.32 {q12,q13}, [r6, :128]!
@ vld1.32 {q14,q15}, [r6, :128]!
sub r4, r4, #4
cmp r4, #0
vswp d17,d24
vswp d19,d26
vswp d21,d28
vswp d23,d30
vst1.32 {q8}, [r7, :128]
vst1.32 {q12}, [r8, :128]
add r7, r7, r3, lsl #4
add r8, r8, r3, lsl #4
vst1.32 {q9}, [r7, :128]
vst1.32 {q13}, [r8, :128]
add r7, r7, r3, lsl #4
add r8, r8, r3, lsl #4
@@vst1.32 {q10}, [r7, :128]
@@vst1.32 {q14}, [r8, :128]
@@add r7, r7, r3, lsl #4
@@add r8, r8, r3, lsl #4
@@vst1.32 {q11}, [r7, :128]
@@vst1.32 {q15}, [r8, :128]
@@add r7, r7, r3, lsl #4
@@add r8, r8, r3, lsl #4
bne _neon_transpose_row
sub r5, r5, #2
cmp r5, #0
add r0, r0, r2, lsl #3
add r1, r1, #16
bne _neon_transpose_col
@ vpop {q8-q9}
pop {r4-r8}
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_transpose_to_buf
_neon_transpose_to_buf:
#else
.globl neon_transpose_to_buf
neon_transpose_to_buf:
#endif
push {r4-r10}
mov r5, #8
_neon_transpose_to_buf_col:
mov r4, #8
add r6, r0, r2, lsl #3
mov r7, r1
add r8, r1, #64
add r9, r1, #128
add r10, r1, #192
_neon_transpose_to_buf_row:
vld1.32 {q8,q9}, [r0, :128]!
vld1.32 {q12,q13}, [r6, :128]!
sub r4, r4, #4
cmp r4, #0
vswp d17,d24
vswp d19,d26
vst1.32 {q8}, [r7, :128]
vst1.32 {q12}, [r8, :128]
vst1.32 {q9}, [r9, :128]
vst1.32 {q13}, [r10, :128]
add r7, r7, #256
add r8, r8, #256
add r9, r9, #256
add r10, r10, #256
bne _neon_transpose_to_buf_row
sub r5, r5, #2
cmp r5, #0
sub r0, r0, #64
add r0, r0, r2, lsl #4
add r1, r1, #16
bne _neon_transpose_to_buf_col
pop {r4-r10}
bx lr
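
The two transpose routines above stream pairs of rows and exchange 2x2 blocks of 8-byte elements with vswp. As a hedged plain-C reference for what neon_transpose is expected to compute (treating each interleaved complex float as one uint64_t, row-major, with my mapping of the w/h arguments to strides -- a reconstruction, not part of FFTS):

#include <stdint.h>

/* Reference transpose of an h-row by w-column matrix of 8-byte
   elements (one interleaved complex float per element). The NEON
   version reaches the same result two rows at a time via vswp. */
static void transpose_ref(const uint64_t *in, uint64_t *out, int w, int h)
{
    for (int row = 0; row < h; row++)
        for (int col = 0; col < w; col++)
            out[(int64_t)col * h + row] = in[(int64_t)row * w + col];
}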

3rdparty/ffts/ffts-master/src/neon_float.h vendored Normal file

File diff suppressed because it is too large.


@@ -0,0 +1,956 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.align 4
#ifdef __APPLE__
.globl _neon_static_e_f
_neon_static_e_f:
#else
.globl neon_static_e_f
neon_static_e_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop
ldr r11, [r1, #12]
vld2.32 {q9}, [r5, :128]! @tag2
vld2.32 {q13}, [r3, :128]! @tag0
vld2.32 {q12}, [r4, :128]! @tag1
vld2.32 {q0}, [r7, :128]! @tag4
vsub.f32 q11, q13, q12
vld2.32 {q8}, [r6, :128]! @tag3
vadd.f32 q12, q13, q12
vsub.f32 q10, q9, q8
vadd.f32 q8, q9, q8
vadd.f32 q9, q12, q8
vsub.f32 d9, d23, d20 @
vadd.f32 d11, d23, d20 @
vsub.f32 q8, q12, q8
vadd.f32 d8, d22, d21 @
vsub.f32 d10, d22, d21 @
ldr r2, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
ldr lr, [r12], #4
vtrn.32 q9, q4
add r2, r0, r2, lsl #2
vtrn.32 q8, q5
add lr, r0, lr, lsl #2
vswp d9,d10
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
vsub.f32 q14, q15, q13
vsub.f32 q12, q0, q11
vadd.f32 q11, q0, q11
vadd.f32 q13, q15, q13
vsub.f32 d13, d29, d24 @
vadd.f32 q15, q13, q11
vadd.f32 d12, d28, d25 @
vadd.f32 d15, d29, d24 @
vsub.f32 d14, d28, d25 @
vtrn.32 q15, q6
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
vmul.f32 d28, d27, d20
vmul.f32 d25, d26, d20
vmul.f32 d26, d27, d21
vmul.f32 d27, d22, d21
vmul.f32 d30, d23, d20
vmul.f32 d29, d23, d21
vmul.f32 d22, d22, d20
vsub.f32 d21, d28, d24
vadd.f32 d20, d26, d25
vadd.f32 d25, d30, d27
vsub.f32 d24, d22, d29
vadd.f32 q11, q12, q10
vsub.f32 q10, q12, q10
vadd.f32 q0, q9, q11
vsub.f32 q2, q9, q11
vsub.f32 d3, d17, d20 @
vadd.f32 d7, d17, d20 @
vadd.f32 d2, d16, d21 @
vsub.f32 d6, d16, d21 @
vswp d1, d2
vswp d5, d6
vstmia r2!, {q0-q3}
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_loop_exit
_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vadd.f32 d7, d19, d16 @
vsub.f32 d3, d19, d16 @
vsub.f32 d6, d18, d17 @
vadd.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vadd.f32 d15, d19, d16 @
ldr lr, [r12], #4
vsub.f32 d11, d19, d16 @
vsub.f32 d14, d18, d17 @
vadd.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_loop
_neon_oo_loop_exit:
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop2
_neon_ee_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_o_f
_neon_static_o_f:
#else
.globl neon_static_o_f
neon_static_o_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_o_loop_exit
_neon_oo_o_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vadd.f32 d7, d19, d16 @
vsub.f32 d3, d19, d16 @
vsub.f32 d6, d18, d17 @
vadd.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vadd.f32 d15, d19, d16 @
ldr lr, [r12], #4
vsub.f32 d11, d19, d16 @
vsub.f32 d14, d18, d17 @
vadd.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_o_loop
_neon_oo_o_loop_exit:
ldr r11, [r1, #8]
vld1.32 {q8}, [r5, :128]!
vld1.32 {q10}, [r6, :128]!
vld2.32 {q11}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vld2.32 {q15}, [r10, :128]!
vorr d25, d17, d17
vorr d24, d20, d20
vorr d20, d16, d16
vsub.f32 q9, q13, q11
vadd.f32 q11, q13, q11
ldr r2, [r12], #4
vtrn.32 d24, d25
ldr lr, [r12], #4
vtrn.32 d20, d21
add r2, r0, r2, lsl #2
vsub.f32 q8, q10, q12
add lr, r0, lr, lsl #2
vadd.f32 q10, q10, q12
vadd.f32 q0, q11, q10
vsub.f32 d25, d19, d16 @
vadd.f32 d27, d19, d16 @
vsub.f32 q1, q11, q10
vadd.f32 d24, d18, d17 @
vsub.f32 d26, d18, d17 @
vtrn.32 q0, q12
vtrn.32 q1, q13
vld1.32 {d24, d25}, [r11, :128]
vswp d1, d2
vst1.32 {q0, q1}, [r2, :128]!
vld2.32 {q0}, [r9, :128]!
vadd.f32 q1, q0, q15
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vsub.f32 q15, q0, q15
vsub.f32 q0, q14, q13
vadd.f32 q3, q14, q13
vadd.f32 q2, q3, q1
vsub.f32 d29, d1, d30 @
vadd.f32 d27, d1, d30 @
vsub.f32 q3, q3, q1
vadd.f32 d28, d0, d31 @
vsub.f32 d26, d0, d31 @
vtrn.32 q2, q14
vtrn.32 q3, q13
vswp d5, d6
vst1.32 {q2, q3}, [r2, :128]!
vtrn.32 q11, q9
vtrn.32 q10, q8
vmul.f32 d20, d18, d25
vmul.f32 d22, d19, d24
vmul.f32 d21, d19, d25
vmul.f32 d18, d18, d24
vmul.f32 d19, d16, d25
vmul.f32 d30, d17, d24
vmul.f32 d23, d16, d24
vmul.f32 d24, d17, d25
vadd.f32 d17, d22, d20
vsub.f32 d16, d18, d21
vsub.f32 d21, d30, d19
vadd.f32 d20, d24, d23
vadd.f32 q9, q8, q10
vsub.f32 q8, q8, q10
vadd.f32 q4, q14, q9
vsub.f32 q6, q14, q9
vsub.f32 d11, d27, d16 @
vadd.f32 d15, d27, d16 @
vadd.f32 d10, d26, d17 @
vsub.f32 d14, d26, d17 @
vswp d9, d10
vswp d13, d14
vstmia lr!, {q4-q7}
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_o_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop2
_neon_ee_o_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x4_f
_neon_static_x4_f:
#else
.globl neon_static_x4_f
neon_static_x4_f:
#endif
@ add r3, r0, #0
push {r4, r5, r6, lr}
vstmdb sp!, {d8-d15}
vld1.32 {q8,q9}, [r0, :128]
add r4, r0, r1, lsl #1
vld1.32 {q10,q11}, [r4, :128]
add r5, r0, r1, lsl #2
vld1.32 {q12,q13}, [r5, :128]
add r6, r4, r1, lsl #2
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q2,q3}, [r2, :128]
vmul.f32 q0, q13, q3
vmul.f32 q5, q12, q2
vmul.f32 q1, q14, q2
vmul.f32 q4, q14, q3
vmul.f32 q14, q12, q3
vmul.f32 q13, q13, q2
vmul.f32 q12, q15, q3
vmul.f32 q2, q15, q2
vsub.f32 q0, q5, q0
vadd.f32 q13, q13, q14
vadd.f32 q12, q12, q1
vsub.f32 q1, q2, q4
vadd.f32 q15, q0, q12
vsub.f32 q12, q0, q12
vadd.f32 q14, q13, q1
vsub.f32 q13, q13, q1
vadd.f32 q0, q8, q15
vadd.f32 q1, q9, q14
vadd.f32 q2, q10, q13 @
vsub.f32 q4, q8, q15
vsub.f32 q3, q11, q12 @
vst1.32 {q0,q1}, [r0, :128]
vsub.f32 q5, q9, q14
vsub.f32 q6, q10, q13 @
vadd.f32 q7, q11, q12 @
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
vldmia sp!, {d8-d15}
pop {r4, r5, r6, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_f
_neon_static_x8_f:
#else
.globl neon_static_x8_f
neon_static_x8_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vadd.f32 q4, q12, q15 @
vsub.f32 q6, q12, q15 @
vsub.f32 q5, q13, q14 @
vadd.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vadd.f32 q2, q8, q10 @
vsub.f32 q3, q9, q12 @
vst1.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vadd.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst1.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst1.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst1.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vadd.f32 q2, q10, q15 @
vsub.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst1.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vsub.f32 q6, q10, q15 @
vst1.32 {q2,q3}, [r6, :128]!
vadd.f32 q7, q11, q14 @
vst1.32 {q4,q5}, [r8, :128]!
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_t_f
_neon_static_x8_t_f:
#else
.globl neon_static_x8_t_f
neon_static_x8_t_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_t_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vadd.f32 q4, q12, q15 @
vsub.f32 q6, q12, q15 @
vsub.f32 q5, q13, q14 @
vadd.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vadd.f32 q2, q8, q10 @
vsub.f32 q3, q9, q12 @
vst2.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vadd.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst2.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst2.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst2.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vadd.f32 q2, q10, q15 @
vsub.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst2.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vsub.f32 q6, q10, q15 @
vst2.32 {q2,q3}, [r6, :128]!
vadd.f32 q7, q11, q14 @
vst2.32 {q4,q5}, [r8, :128]!
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}


@@ -0,0 +1,955 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.align 4
#ifdef __APPLE__
.globl _neon_static_e_i
_neon_static_e_i:
#else
.globl neon_static_e_i
neon_static_e_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop
ldr r11, [r1, #12]
vld2.32 {q9}, [r5, :128]! @tag2
vld2.32 {q13}, [r3, :128]! @tag0
vld2.32 {q12}, [r4, :128]! @tag1
vld2.32 {q0}, [r7, :128]! @tag4
vsub.f32 q11, q13, q12
vld2.32 {q8}, [r6, :128]! @tag3
vadd.f32 q12, q13, q12
vsub.f32 q10, q9, q8
vadd.f32 q8, q9, q8
vadd.f32 q9, q12, q8
vadd.f32 d9, d23, d20 @
vsub.f32 d11, d23, d20 @
vsub.f32 q8, q12, q8
vsub.f32 d8, d22, d21 @
vadd.f32 d10, d22, d21 @
ldr r2, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
ldr lr, [r12], #4
vtrn.32 q9, q4
add r2, r0, r2, lsl #2
vtrn.32 q8, q5
add lr, r0, lr, lsl #2
vswp d9,d10
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
vsub.f32 q14, q15, q13
vsub.f32 q12, q0, q11
vadd.f32 q11, q0, q11
vadd.f32 q13, q15, q13
vadd.f32 d13, d29, d24 @
vadd.f32 q15, q13, q11
vsub.f32 d12, d28, d25 @
vsub.f32 d15, d29, d24 @
vadd.f32 d14, d28, d25 @
vtrn.32 q15, q6
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
vmul.f32 d28, d27, d20
vmul.f32 d25, d26, d20
vmul.f32 d26, d27, d21
vmul.f32 d27, d22, d21
vmul.f32 d30, d23, d20
vmul.f32 d29, d23, d21
vmul.f32 d22, d22, d20
vsub.f32 d21, d28, d24
vadd.f32 d20, d26, d25
vadd.f32 d25, d30, d27
vsub.f32 d24, d22, d29
vadd.f32 q11, q12, q10
vsub.f32 q10, q12, q10
vadd.f32 q0, q9, q11
vsub.f32 q2, q9, q11
vadd.f32 d3, d17, d20 @
vsub.f32 d7, d17, d20 @
vsub.f32 d2, d16, d21 @
vadd.f32 d6, d16, d21 @
vswp d1, d2
vswp d5, d6
vstmia r2!, {q0-q3}
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_loop_exit
_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vsub.f32 d7, d19, d16 @
vadd.f32 d3, d19, d16 @
vadd.f32 d6, d18, d17 @
vsub.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vsub.f32 d15, d19, d16 @
ldr lr, [r12], #4
vadd.f32 d11, d19, d16 @
vadd.f32 d14, d18, d17 @
vsub.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_loop
_neon_oo_loop_exit:
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop2
_neon_ee_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_o_i
_neon_static_o_i:
#else
.globl neon_static_o_i
neon_static_o_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_o_loop_exit
_neon_oo_o_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vsub.f32 d7, d19, d16 @
vadd.f32 d3, d19, d16 @
vadd.f32 d6, d18, d17 @
vsub.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vsub.f32 d15, d19, d16 @
ldr lr, [r12], #4
vadd.f32 d11, d19, d16 @
vadd.f32 d14, d18, d17 @
vsub.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_o_loop
_neon_oo_o_loop_exit:
ldr r11, [r1, #8]
vld1.32 {q8}, [r5, :128]!
vld1.32 {q10}, [r6, :128]!
vld2.32 {q11}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vld2.32 {q15}, [r10, :128]!
vorr d25, d17, d17
vorr d24, d20, d20
vorr d20, d16, d16
vsub.f32 q9, q13, q11
vadd.f32 q11, q13, q11
ldr r2, [r12], #4
vtrn.32 d24, d25
ldr lr, [r12], #4
vtrn.32 d20, d21
add r2, r0, r2, lsl #2
vsub.f32 q8, q10, q12
add lr, r0, lr, lsl #2
vadd.f32 q10, q10, q12
vadd.f32 q0, q11, q10
vadd.f32 d25, d19, d16 @
vsub.f32 d27, d19, d16 @
vsub.f32 q1, q11, q10
vsub.f32 d24, d18, d17 @
vadd.f32 d26, d18, d17 @
vtrn.32 q0, q12
vtrn.32 q1, q13
vld1.32 {d24, d25}, [r11, :128]
vswp d1, d2
vst1.32 {q0, q1}, [r2, :128]!
vld2.32 {q0}, [r9, :128]!
vadd.f32 q1, q0, q15
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vsub.f32 q15, q0, q15
vsub.f32 q0, q14, q13
vadd.f32 q3, q14, q13
vadd.f32 q2, q3, q1
vadd.f32 d29, d1, d30 @
vsub.f32 d27, d1, d30 @
vsub.f32 q3, q3, q1
vsub.f32 d28, d0, d31 @
vadd.f32 d26, d0, d31 @
vtrn.32 q2, q14
vtrn.32 q3, q13
vswp d5, d6
vst1.32 {q2, q3}, [r2, :128]!
vtrn.32 q11, q9
vtrn.32 q10, q8
vmul.f32 d20, d18, d25
vmul.f32 d22, d19, d24
vmul.f32 d21, d19, d25
vmul.f32 d18, d18, d24
vmul.f32 d19, d16, d25
vmul.f32 d30, d17, d24
vmul.f32 d23, d16, d24
vmul.f32 d24, d17, d25
vadd.f32 d17, d22, d20
vsub.f32 d16, d18, d21
vsub.f32 d21, d30, d19
vadd.f32 d20, d24, d23
vadd.f32 q9, q8, q10
vsub.f32 q8, q8, q10
vadd.f32 q4, q14, q9
vsub.f32 q6, q14, q9
vadd.f32 d11, d27, d16 @
vsub.f32 d15, d27, d16 @
vsub.f32 d10, d26, d17 @
vadd.f32 d14, d26, d17 @
vswp d9, d10
vswp d13, d14
vstmia lr!, {q4-q7}
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_o_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop2
_neon_ee_o_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x4_i
_neon_static_x4_i:
#else
.globl neon_static_x4_i
neon_static_x4_i:
#endif
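@ Added note (my reading, not original commentary): this appears to be a
@ single in-place radix-4 butterfly pass -- r0 = data, r1 scales the address
@ strides, r2 = twiddle factors. Four complex quads are loaded, the upper two
@ are complex-multiplied by the twiddle pair, and the results are recombined
@ and stored back to the same addresses.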
@ add r3, r0, #0
push {r4, r5, r6, lr}
vstmdb sp!, {d8-d15}
vld1.32 {q8,q9}, [r0, :128]
add r4, r0, r1, lsl #1
vld1.32 {q10,q11}, [r4, :128]
add r5, r0, r1, lsl #2
vld1.32 {q12,q13}, [r5, :128]
add r6, r4, r1, lsl #2
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q2,q3}, [r2, :128]
vmul.f32 q0, q13, q3
vmul.f32 q5, q12, q2
vmul.f32 q1, q14, q2
vmul.f32 q4, q14, q3
vmul.f32 q14, q12, q3
vmul.f32 q13, q13, q2
vmul.f32 q12, q15, q3
vmul.f32 q2, q15, q2
vsub.f32 q0, q5, q0
vadd.f32 q13, q13, q14
vadd.f32 q12, q12, q1
vsub.f32 q1, q2, q4
vadd.f32 q15, q0, q12
vsub.f32 q12, q0, q12
vadd.f32 q14, q13, q1
vsub.f32 q13, q13, q1
vadd.f32 q0, q8, q15
vadd.f32 q1, q9, q14
vsub.f32 q2, q10, q13 @
vsub.f32 q4, q8, q15
vadd.f32 q3, q11, q12 @
vst1.32 {q0,q1}, [r0, :128]
vsub.f32 q5, q9, q14
vadd.f32 q6, q10, q13 @
vsub.f32 q7, q11, q12 @
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
vldmia sp!, {d8-d15}
pop {r4, r5, r6, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_i
_neon_static_x8_i:
#else
.globl neon_static_x8_i
neon_static_x8_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
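@ Added note: r11 is initialised to -(r1 >> 5) and the adds at the top of the
@ loop increments it toward zero, so the body runs r1/32 times -- each pass
@ appears to consume 32 bytes (four complex floats) from every data pointer.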
neon_x8_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst1.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst1.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst1.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst1.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst1.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst1.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst1.32 {q4,q5}, [r8, :128]!
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_t_i
_neon_static_x8_t_i:
#else
.globl neon_static_x8_t_i
neon_static_x8_t_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_t_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst2.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst2.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst2.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst2.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst2.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst2.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst2.32 {q4,q5}, [r8, :128]!
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}

209
3rdparty/ffts/ffts-master/src/patterns.c vendored Normal file
View File

@@ -0,0 +1,209 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "patterns.h"
void permute_addr(int N, int offset, int stride, int *d) {
int i, a[4] = {0,2,1,3};
for(i=0;i<4;i++) {
d[i] = offset + (a[i] << stride);
if(d[i] < 0) d[i] += N;
}
}
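/* Example (illustrative, not from the original): permute_addr(16, 0, 0, d)
   yields d = {0, 2, 1, 3}; with a negative offset, permute_addr(16, -2, 0, d)
   wraps around to d = {14, 0, 15, 1} -- addresses are reduced modulo N. */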
void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
	if(N > 4) {
		ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride), offset+(N/2), stride + 2, 0, VL);
		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride), offset+(3*N/4), stride + 2, 0, VL);
		else { /* note: this else binds to the second if(N/4 >= 4) above */
			int temp = poffset+(1<<stride);
			if(temp < 0) temp += bigN;
			temp *= 2;

			if(!(temp % (VL*2))) {
				(*is)[0] = poffset+(1<<stride);
				(*is)[1] = poffset+(1<<stride)+(1<<(stride+2));
				(*is)[2] = poffset-(1<<stride);
				(*is)[3] = poffset-(1<<stride)+(1<<(stride+2));
				int i;
				for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN;
				for(i=0;i<4;i++) (*is)[i] *= 2;
				*is += 4;
			}
		}
	} else if(N == 4) {
		int perm[4];
		permute_addr(bigN, poffset, stride, perm);
		if(!((perm[0]*2) % (VL*2))) {
			int i;
			for(i=0;i<4;i++) {
				(*is)[i] = perm[i] * 2;
			}
			*is += 4;
		}
	}
}
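/* Added note (my reading of the recursion above): it walks the split-radix
   decomposition and, for each size-4 leaf whose first input address is
   vector-aligned, emits four input indices, doubled for the interleaved
   re/im float layout. */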
void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
	int i, i0 = N/leafN/3+1, i1 = N/leafN/3, i2 = N/leafN/3;
	int stride = log(N/leafN)/log(2);

	p->is = malloc(N/VL * sizeof(ptrdiff_t));
	ptrdiff_t *is = p->is;

	if((N/leafN) % 3 > 1) i1++;

	for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
	for(i=i0;i<i0+i1;i++) {
		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
	}
	for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);

	//for(i=0;i<N/VL;i++) {
	//	printf("%td ", p->is[i]);
	//	if(i % 16 == 15) printf("\n");
	//}

	p->i0 = i0; p->i1 = i1;
}
/**
 *
 *
 */
void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
	if((even && N == leafN) || (!even && N <= leafN)) {
		offsets[2*(ooffset/leafN)] = ioffset*2;
		offsets[2*(ooffset/leafN)+1] = ooffset;
	} else if(N > 4) {
		ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
		if(N/4 >= leafN)
			ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
	}
}
int compare_offsets(const void *a, const void *b) {
	return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
}
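/* Added note: the ptrdiff_t difference is truncated to int on return; this
   is safe here only because the compared offsets are bounded by N. */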
uint32_t reverse_bits(uint32_t a, int n) {
	uint32_t x = 0;
	int i;
	for(i=0;i<n;i++) {
		if(a & (1 << i)) x |= 1 << (n-i-1);
	}
	return x;
}
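/* Example: reverse_bits(0x3, 4) == 0xC -- 0b0011 reflected in a 4-bit field
   is 0b1100. Within this file it is only used by the debug print commented
   out in ffts_init_offsets() below. */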
void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
	ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));

	ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);

	size_t i;
	for(i=0;i<2*N/leafN;i+=2) {
		if(offsets[i] < 0) offsets[i] = N + offsets[i];
	}

	qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets);
	//elaborate_is(p, N, 0, 0, 1);

	p->offsets = malloc(N/leafN * sizeof(ptrdiff_t));
	for(i=0;i<N/leafN;i++) {
		p->offsets[i] = offsets[i*2+1]*2;
	}

	//for(i=0;i<N/leafN;i++) {
	//	printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
	//}

	free(offsets);
}
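/* Added note (my reading): the (input, output) pairs are sorted by input
   index and only the output half of each pair is kept, pre-doubled for the
   interleaved re/im layout. */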
/*
int tree_count(int N, int leafN, int offset) {
	if(N <= leafN) return 0;

	int count = 0;
	count += tree_count(N/4, leafN, offset);
	count += tree_count(N/8, leafN, offset + N/4);
	count += tree_count(N/8, leafN, offset + N/4 + N/8);
	count += tree_count(N/4, leafN, offset + N/2);
	count += tree_count(N/4, leafN, offset + 3*N/4);

	return 1 + count;
}

void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
	if(N <= leafN) return;

	elaborate_tree(p, N/4, leafN, offset);
	elaborate_tree(p, N/8, leafN, offset + N/4);
	elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
	elaborate_tree(p, N/4, leafN, offset + N/2);
	elaborate_tree(p, N/4, leafN, offset + 3*N/4);

	(*p)[0] = N;
	(*p)[1] = offset*2;
	(*p) += 2;
}

void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
	int count = tree_count(N, leafN, 0) + 1;
	transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));

	//printf("count = %d\n", count);

	elaborate_tree(&ps, N, leafN, 0);
#ifdef __ARM_NEON__
	ps -= 2;
#endif
	ps[0] = 0;
	ps[1] = 0;

	//int i;
	//for(i=0;i<count;i++) {
	//	fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
	//		__builtin_ctzl(p->transforms[i*2]) - 5);
	//}
}
*/

// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

45
3rdparty/ffts/ffts-master/src/patterns.h vendored Normal file
View File

@@ -0,0 +1,45 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __PATTERNS_H__
#define __PATTERNS_H__
#include "ffts.h"
void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL);
void ffts_init_offsets(ffts_plan_t *p, int N, int leafN);
//void ffts_init_tree(ffts_plan_t *p, int N, int leafN);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

878
3rdparty/ffts/ffts-master/src/sse.s vendored Normal file
View File

@@ -0,0 +1,878 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.globl _neon_x4
.align 4
_neon_x4:
.globl _neon_x8
.align 4
_neon_x8:
.globl _neon_x8_t
.align 4
_neon_x8_t:
#ifdef __APPLE__
.globl _leaf_ee_init
_leaf_ee_init:
#else
.globl leaf_ee_init
leaf_ee_init:
#endif
#lea L_sse_constants(%rip), %r9
movq 0xe0(%rdi), %r9
xorl %eax, %eax
# eax is loop counter (init to 0)
# rcx is loop max count
# rsi is 'in' base pointer
# rdx is 'out' base pointer
# r8 is offsets pointer
# r9 is constants pointer
# scratch: rax r11 r12
# .align 4, 0x90
# _leaf_ee + 9 needs 16 byte alignment
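# Added note (not in the original): the 0xFECA displacements in the
# LEAF_*_const_* loads below are placeholders. The code generator copies
# these leaf bodies at plan time and patches the real input offsets into
# the instruction bytes recorded in the sse_leaf_*_offsets tables at the
# end of this file.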
#ifdef __APPLE__
.globl _leaf_ee
_leaf_ee:
#else
.globl leaf_ee
leaf_ee:
#endif
movaps 32(%r9), %xmm0 #83.5
movaps (%r9), %xmm8 #83.5
LEAF_EE_1:
LEAF_EE_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
LEAF_EE_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
movaps %xmm7, %xmm6 #83.5
LEAF_EE_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
movaps %xmm12, %xmm11 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm10, %xmm11 #83.5
xorps %xmm8, %xmm12 #83.5
LEAF_EE_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
LEAF_EE_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
addps %xmm9, %xmm6 #83.5
subps %xmm9, %xmm7 #83.5
LEAF_EE_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
movaps %xmm10, %xmm9 #83.5
LEAF_EE_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
movaps %xmm6, %xmm5 #83.5
LEAF_EE_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
movaps %xmm3, %xmm15 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm7, %xmm4 #83.5
movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm13, %xmm10 #83.5
subps %xmm14, %xmm3 #83.5
addps %xmm11, %xmm5 #83.5
subps %xmm11, %xmm6 #83.5
subps %xmm12, %xmm4 #83.5
addps %xmm12, %xmm7 #83.5
addps %xmm13, %xmm9 #83.5
addps %xmm14, %xmm15 #83.5
movaps 16(%r9), %xmm12 #83.5
movaps %xmm9, %xmm1 #83.5
movaps 16(%r9), %xmm11 #83.5
movaps %xmm5, %xmm2 #83.5
mulps %xmm10, %xmm12 #83.5
subps %xmm15, %xmm9 #83.5
addps %xmm15, %xmm1 #83.5
mulps %xmm3, %xmm11 #83.5
addps %xmm1, %xmm2 #83.5
subps %xmm1, %xmm5 #83.5
shufps $177, %xmm10, %xmm10 #83.5
xorps %xmm8, %xmm9 #83.5
shufps $177, %xmm3, %xmm3 #83.5
movaps %xmm6, %xmm1 #83.5
mulps %xmm0, %xmm10 #83.5
movaps %xmm4, %xmm13 #83.5
mulps %xmm0, %xmm3 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm3, %xmm11 #83.5
movaps %xmm12, %xmm3 #83.5
movaps %xmm7, %xmm14 #83.5
shufps $177, %xmm9, %xmm9 #83.5
subps %xmm11, %xmm12 #83.5
addps %xmm11, %xmm3 #83.5
subps %xmm9, %xmm1 #83.5
addps %xmm9, %xmm6 #83.5
addps %xmm3, %xmm4 #83.5
subps %xmm3, %xmm13 #83.5
xorps %xmm8, %xmm12 #83.5
movaps %xmm2, %xmm3 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm6, %xmm9 #83.5
movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm4, %xmm3 #83.5
addq $4, %rax
shufps $238, %xmm4, %xmm2 #83.5
movaps %xmm1, %xmm4 #83.5
#movntdq %xmm3, (%rdx,%r11,4) #83.5
subps %xmm12, %xmm7 #83.5
addps %xmm12, %xmm14 #83.5
movlhps %xmm7, %xmm4 #83.5
shufps $238, %xmm7, %xmm1 #83.5
movaps %xmm5, %xmm7 #83.5
movlhps %xmm13, %xmm7 #83.5
movlhps %xmm14, %xmm9 #83.5
shufps $238, %xmm13, %xmm5 #83.5
shufps $238, %xmm14, %xmm6 #83.5
movaps %xmm3, (%rdx,%r11,4) #83.5
movaps %xmm4, 16(%rdx,%r11,4) #83.5
movaps %xmm7, 32(%rdx,%r11,4) #83.5
movaps %xmm9, 48(%rdx,%r11,4) #83.5
movaps %xmm2, (%rdx,%r12,4) #83.5
movaps %xmm1, 16(%rdx,%r12,4) #83.5
movaps %xmm5, 32(%rdx,%r12,4) #83.5
movaps %xmm6, 48(%rdx,%r12,4) #83.5
cmpq %rcx, %rax
jne LEAF_EE_1
# _leaf_oo + 4 needs to be 16 byte aligned
#ifdef __APPLE__
.globl _leaf_oo
_leaf_oo:
#else
.globl leaf_oo
leaf_oo:
#endif
movaps (%r9), %xmm5 #92.7
LEAF_OO_1:
LEAF_OO_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
movaps %xmm4, %xmm6 #93.5
LEAF_OO_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
LEAF_OO_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
addps %xmm7, %xmm6 #93.5
subps %xmm7, %xmm4 #93.5
LEAF_OO_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
movaps %xmm10, %xmm9 #93.5
LEAF_OO_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
movaps %xmm6, %xmm3 #93.5
LEAF_OO_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
movaps %xmm1, %xmm2 #93.5
LEAF_OO_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
movaps %xmm4, %xmm15 #93.5
LEAF_OO_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
movaps %xmm14, %xmm13 #93.5
movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm8, %xmm10 #93.5
addps %xmm8, %xmm9 #93.5
addps %xmm11, %xmm2 #93.5
subps %xmm12, %xmm14 #93.5
subps %xmm11, %xmm1 #93.5
addps %xmm12, %xmm13 #93.5
addps %xmm9, %xmm3 #93.5
subps %xmm9, %xmm6 #93.5
xorps %xmm5, %xmm10 #93.5
xorps %xmm5, %xmm14 #93.5
shufps $177, %xmm10, %xmm10 #93.5
movaps %xmm2, %xmm9 #93.5
shufps $177, %xmm14, %xmm14 #93.5
movaps %xmm6, %xmm7 #93.5
movslq 8(%r8, %rax, 4), %r12 #83.59
addq $4, %rax #92.18
addps %xmm10, %xmm4 #93.5
addps %xmm13, %xmm9 #93.5
subps %xmm13, %xmm2 #93.5
subps %xmm10, %xmm15 #93.5
movaps %xmm1, %xmm13 #93.5
movaps %xmm2, %xmm8 #93.5
movlhps %xmm4, %xmm7 #93.5
subps %xmm14, %xmm13 #93.5
addps %xmm14, %xmm1 #93.5
shufps $238, %xmm4, %xmm6 #93.5
movaps %xmm3, %xmm14 #93.5
movaps %xmm9, %xmm4 #93.5
movlhps %xmm15, %xmm14 #93.5
movlhps %xmm13, %xmm4 #93.5
movlhps %xmm1, %xmm8 #93.5
shufps $238, %xmm15, %xmm3 #93.5
shufps $238, %xmm13, %xmm9 #93.5
shufps $238, %xmm1, %xmm2 #93.5
movaps %xmm14, (%rdx,%r11,4) #93.5
movaps %xmm7, 16(%rdx,%r11,4) #93.5
movaps %xmm4, 32(%rdx,%r11,4) #93.5
movaps %xmm8, 48(%rdx,%r11,4) #93.5
movaps %xmm3, (%rdx,%r12,4) #93.5
movaps %xmm6, 16(%rdx,%r12,4) #93.5
movaps %xmm9, 32(%rdx,%r12,4) #93.5
movaps %xmm2, 48(%rdx,%r12,4) #93.5
cmpq %rcx, %rax
jne LEAF_OO_1 # Prob 95% #92.14
#ifdef __APPLE__
.globl _leaf_eo
_leaf_eo:
#else
.globl leaf_eo
leaf_eo:
#endif
LEAF_EO_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
LEAF_EO_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
movaps %xmm9, %xmm11 #88.5
LEAF_EO_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
movaps %xmm7, %xmm6 #88.5
LEAF_EO_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
subps %xmm5, %xmm7 #88.5
addps %xmm4, %xmm11 #88.5
subps %xmm4, %xmm9 #88.5
addps %xmm5, %xmm6 #88.5
movaps (%r9), %xmm3 #88.5
movaps %xmm11, %xmm10 #88.5
xorps %xmm3, %xmm7 #88.5
movaps %xmm9, %xmm8 #88.5
shufps $177, %xmm7, %xmm7 #88.5
addps %xmm6, %xmm10 #88.5
subps %xmm6, %xmm11 #88.5
subps %xmm7, %xmm8 #88.5
addps %xmm7, %xmm9 #88.5
movslq 8(%r8, %rax, 4), %r12 #83.59
movaps %xmm10, %xmm2 #88.5
movslq (%r8, %rax, 4), %r11 #83.44
movaps %xmm11, %xmm1 #88.5
shufps $238, %xmm8, %xmm10 #88.5
shufps $238, %xmm9, %xmm11 #88.5
movaps %xmm10, (%rdx,%r12,4) #88.5
movaps %xmm11, 16(%rdx,%r12,4) #88.5
LEAF_EO_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
LEAF_EO_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
movaps %xmm15, %xmm14 #88.5
LEAF_EO_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
addps %xmm12, %xmm14 #88.5
subps %xmm12, %xmm15 #88.5
LEAF_EO_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
movaps %xmm4, %xmm5 #88.5
movaps %xmm14, %xmm7 #88.5
addps %xmm13, %xmm5 #88.5
subps %xmm13, %xmm4 #88.5
movlhps %xmm8, %xmm2 #88.5
movaps %xmm5, %xmm8 #88.5
movlhps %xmm15, %xmm7 #88.5
xorps %xmm3, %xmm15 #88.5
movaps %xmm5, %xmm6 #88.5
subps %xmm14, %xmm5 #88.5
addps %xmm14, %xmm6 #88.5
movlhps %xmm9, %xmm1 #88.5
movaps %xmm4, %xmm14 #88.5
movlhps %xmm4, %xmm8 #88.5
movaps %xmm1, %xmm12 #88.5
shufps $177, %xmm15, %xmm15 #88.5
movaps 0x30(%r9), %xmm11 #88.5
addq $4, %rax #90.5
subps %xmm15, %xmm14 #88.5
mulps %xmm7, %xmm11 #88.5
addps %xmm15, %xmm4 #88.5
movaps 0x30(%r9), %xmm9 #88.5
movaps 0x40(%r9), %xmm15 #88.5
shufps $177, %xmm7, %xmm7 #88.5
mulps %xmm8, %xmm9 #88.5
mulps %xmm15, %xmm7 #88.5
shufps $177, %xmm8, %xmm8 #88.5
subps %xmm7, %xmm11 #88.5
mulps %xmm15, %xmm8 #88.5
movaps %xmm11, %xmm10 #88.5
addps %xmm8, %xmm9 #88.5
shufps $238, %xmm14, %xmm6 #88.5
subps %xmm9, %xmm11 #88.5
addps %xmm9, %xmm10 #88.5
xorps %xmm3, %xmm11 #88.5
movaps %xmm2, %xmm3 #88.5
shufps $177, %xmm11, %xmm11 #88.5
subps %xmm10, %xmm3 #88.5
addps %xmm10, %xmm2 #88.5
addps %xmm11, %xmm12 #88.5
subps %xmm11, %xmm1 #88.5
shufps $238, %xmm4, %xmm5 #88.5
movaps %xmm5, 48(%rdx,%r12,4) #88.5
movaps %xmm6, 32(%rdx,%r12,4) #88.5
movaps %xmm2, (%rdx,%r11,4) #88.5
movaps %xmm1, 16(%rdx,%r11,4) #88.5
movaps %xmm3, 32(%rdx,%r11,4) #88.5
movaps %xmm12, 48(%rdx,%r11,4) #88.5
#ifdef __APPLE__
.globl _leaf_oe
_leaf_oe:
#else
.globl leaf_oe
leaf_oe:
#endif
movaps (%r9), %xmm0 #59.5
#movaps 0x20(%r9), %xmm1 #59.5
LEAF_OE_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
LEAF_OE_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
movaps %xmm6, %xmm10 #70.5
shufps $228, %xmm8, %xmm10 #70.5
movaps %xmm10, %xmm9 #70.5
shufps $228, %xmm6, %xmm8 #70.5
LEAF_OE_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
LEAF_OE_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm12, %xmm14 #70.5
movslq (%r8, %rax, 4), %r11 #83.44
addps %xmm8, %xmm9 #70.5
subps %xmm8, %xmm10 #70.5
addps %xmm7, %xmm14 #70.5
subps %xmm7, %xmm12 #70.5
movaps %xmm9, %xmm4 #70.5
movaps %xmm14, %xmm13 #70.5
shufps $238, %xmm10, %xmm4 #70.5
xorps %xmm0, %xmm10 #70.5
shufps $177, %xmm10, %xmm10 #70.5
movaps %xmm12, %xmm11 #70.5
movaps %xmm14, %xmm5 #70.5
addps %xmm9, %xmm13 #70.5
subps %xmm10, %xmm11 #70.5
subps %xmm9, %xmm14 #70.5
shufps $238, %xmm12, %xmm5 #70.5
addps %xmm10, %xmm12 #70.5
movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm11, %xmm13 #70.5
movaps %xmm13, (%rdx,%r11,4) #70.5
movaps 0x30(%r9), %xmm13 #70.5
movlhps %xmm12, %xmm14 #70.5
movaps 0x40(%r9), %xmm12 #70.5
mulps %xmm5, %xmm13 #70.5
shufps $177, %xmm5, %xmm5 #70.5
mulps %xmm12, %xmm5 #70.5
movaps %xmm14, 16(%rdx,%r11,4) #70.5
subps %xmm5, %xmm13 #70.5
movaps 0x30(%r9), %xmm5 #70.5
mulps %xmm4, %xmm5 #70.5
shufps $177, %xmm4, %xmm4 #70.5
mulps %xmm12, %xmm4 #70.5
LEAF_OE_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
addps %xmm4, %xmm5 #70.5
LEAF_OE_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm9, %xmm3 #70.5
LEAF_OE_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
movaps %xmm7, %xmm6 #70.5
LEAF_OE_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
movaps %xmm13, %xmm4 #70.5
subps %xmm2, %xmm7 #70.5
addps %xmm15, %xmm3 #70.5
subps %xmm15, %xmm9 #70.5
addps %xmm2, %xmm6 #70.5
subps %xmm5, %xmm13 #70.5
addps %xmm5, %xmm4 #70.5
xorps %xmm0, %xmm7 #70.5
addq $4, %rax #72.5
movaps %xmm3, %xmm2 #70.5
shufps $177, %xmm7, %xmm7 #70.5
movaps %xmm9, %xmm8 #70.5
xorps %xmm0, %xmm13 #70.5
addps %xmm6, %xmm2 #70.5
subps %xmm7, %xmm8 #70.5
subps %xmm6, %xmm3 #70.5
addps %xmm7, %xmm9 #70.5
movaps %xmm2, %xmm10 #70.5
movaps %xmm3, %xmm11 #70.5
shufps $238, %xmm8, %xmm2 #70.5
shufps $238, %xmm9, %xmm3 #70.5
movaps %xmm2, %xmm14 #70.5
shufps $177, %xmm13, %xmm13 #70.5
subps %xmm4, %xmm14 #70.5
addps %xmm4, %xmm2 #70.5
movaps %xmm3, %xmm4 #70.5
subps %xmm13, %xmm3 #70.5
addps %xmm13, %xmm4 #70.5
movlhps %xmm8, %xmm10 #70.5
movlhps %xmm9, %xmm11 #70.5
movaps %xmm10, 32(%rdx,%r11,4) #70.5
movaps %xmm11, 48(%rdx,%r11,4) #70.5
movaps %xmm2, (%rdx,%r12,4) #70.5
movaps %xmm3, 16(%rdx,%r12,4) #70.5
movaps %xmm14, 32(%rdx,%r12,4) #70.5
movaps %xmm4, 48(%rdx,%r12,4) #70.5
#ifdef __APPLE__
.globl _leaf_end
_leaf_end:
#else
.globl leaf_end
leaf_end:
#endif
#ifdef __APPLE__
.globl _x_init
_x_init:
#else
.globl x_init
x_init:
#endif
#movaps L_sse_constants(%rip), %xmm3 #34.3
movaps (%r9), %xmm3 #34.3
movq 0x20(%rdi),%r8
#ifdef __APPLE__
.globl _x4
_x4:
#else
.globl x4
x4:
#endif
movaps 64(%rdx), %xmm0 #34.3
movaps 96(%rdx), %xmm1 #34.3
movaps (%rdx), %xmm7 #34.3
movaps (%r8), %xmm4 #const
movaps %xmm7, %xmm9 #34.3
movaps %xmm4, %xmm6 #34.3
movaps 16(%r8), %xmm2 #const
mulps %xmm0, %xmm6 #34.3
mulps %xmm1, %xmm4 #34.3
shufps $177, %xmm0, %xmm0 #34.3
shufps $177, %xmm1, %xmm1 #34.3
mulps %xmm2, %xmm0 #34.3
mulps %xmm1, %xmm2 #34.3
subps %xmm0, %xmm6 #34.3
addps %xmm2, %xmm4 #34.3
movaps %xmm6, %xmm5 #34.3
subps %xmm4, %xmm6 #34.3
addps %xmm4, %xmm5 #34.3
movaps 32(%rdx), %xmm8 #34.3
xorps %xmm3, %xmm6 #34.3
shufps $177, %xmm6, %xmm6 #34.3
movaps %xmm8, %xmm10 #34.3
movaps 112(%rdx), %xmm12 #34.3
subps %xmm5, %xmm9 #34.3
addps %xmm5, %xmm7 #34.3
addps %xmm6, %xmm10 #34.3
subps %xmm6, %xmm8 #34.3
movaps %xmm7, (%rdx) #34.3
movaps %xmm8, 32(%rdx) #34.3
movaps %xmm9, 64(%rdx) #34.3
movaps %xmm10, 96(%rdx) #34.3
movaps 32(%r8), %xmm14 #const #34.3
movaps 80(%rdx), %xmm11 #34.3
movaps %xmm14, %xmm0 #34.3
movaps 48(%r8), %xmm13 #const #34.3
mulps %xmm11, %xmm0 #34.3
mulps %xmm12, %xmm14 #34.3
shufps $177, %xmm11, %xmm11 #34.3
shufps $177, %xmm12, %xmm12 #34.3
mulps %xmm13, %xmm11 #34.3
mulps %xmm12, %xmm13 #34.3
subps %xmm11, %xmm0 #34.3
addps %xmm13, %xmm14 #34.3
movaps %xmm0, %xmm15 #34.3
subps %xmm14, %xmm0 #34.3
addps %xmm14, %xmm15 #34.3
xorps %xmm3, %xmm0 #34.3
movaps 16(%rdx), %xmm1 #34.3
movaps 48(%rdx), %xmm2 #34.3
movaps %xmm1, %xmm4 #34.3
shufps $177, %xmm0, %xmm0 #34.3
movaps %xmm2, %xmm5 #34.3
addps %xmm15, %xmm1 #34.3
subps %xmm0, %xmm2 #34.3
subps %xmm15, %xmm4 #34.3
addps %xmm0, %xmm5 #34.3
movaps %xmm1, 16(%rdx) #34.3
movaps %xmm2, 48(%rdx) #34.3
movaps %xmm4, 80(%rdx) #34.3
movaps %xmm5, 112(%rdx) #34.3
ret
# _x8_soft + 5 needs to be 16 byte aligned
#ifdef __APPLE__
.globl _x8_soft
_x8_soft:
#else
.globl x8_soft
x8_soft:
#endif
xorl %eax, %eax
movq %rdx, %rbx
movq %r8, %rsi
leaq (%rdx,%rcx,4), %r9
leaq (%r9,%rcx,4), %r10
leaq (%r10,%rcx,4), %r11
leaq (%r11,%rcx,4), %r12
leaq (%r12,%rcx,4), %r13
leaq (%r13,%rcx,4), %r14
leaq (%r14,%rcx,4), %r15
X8_soft_loop:
movaps (%rsi), %xmm9
movaps (%r10,%rax,4), %xmm6
movaps %xmm9, %xmm11
movaps (%r11,%rax,4), %xmm7
movaps 16(%rsi), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
shufps $177, %xmm6, %xmm6
mulps %xmm8, %xmm6
shufps $177, %xmm7, %xmm7
subps %xmm6, %xmm11
mulps %xmm7, %xmm8
movaps %xmm11, %xmm10
addps %xmm8, %xmm9
movaps 32(%rsi), %xmm15
addps %xmm9, %xmm10
subps %xmm9, %xmm11
movaps (%rbx,%rax,4), %xmm5
movaps %xmm15, %xmm6
movaps (%r12,%rax,4), %xmm12
movaps %xmm5, %xmm2
movaps (%r14,%rax,4), %xmm13
xorps %xmm3, %xmm11 #const
movaps 48(%rsi), %xmm14
subps %xmm10, %xmm2
mulps %xmm12, %xmm6
addps %xmm10, %xmm5
mulps %xmm13, %xmm15
movaps 64(%rsi), %xmm10
movaps %xmm5, %xmm0
shufps $177, %xmm12, %xmm12
shufps $177, %xmm13, %xmm13
mulps %xmm14, %xmm12
mulps %xmm13, %xmm14
subps %xmm12, %xmm6
addps %xmm14, %xmm15
movaps (%r13,%rax,4), %xmm7
movaps %xmm10, %xmm13
movaps (%r15,%rax,4), %xmm8
movaps %xmm6, %xmm12
movaps 80(%rsi), %xmm9
addq $96, %rsi
mulps %xmm7, %xmm13
subps %xmm15, %xmm6
addps %xmm15, %xmm12
mulps %xmm8, %xmm10
subps %xmm12, %xmm0
addps %xmm12, %xmm5
shufps $177, %xmm7, %xmm7
xorps %xmm3, %xmm6 #const
shufps $177, %xmm8, %xmm8
movaps %xmm2, %xmm12
mulps %xmm9, %xmm7
mulps %xmm8, %xmm9
subps %xmm7, %xmm13
addps %xmm9, %xmm10
movaps (%r9,%rax,4), %xmm4
shufps $177, %xmm11, %xmm11
movaps %xmm4, %xmm1
shufps $177, %xmm6, %xmm6
addps %xmm11, %xmm1
subps %xmm11, %xmm4
addps %xmm6, %xmm12
subps %xmm6, %xmm2
movaps %xmm13, %xmm11
movaps %xmm4, %xmm14
movaps %xmm1, %xmm6
subps %xmm10, %xmm13
addps %xmm10, %xmm11
xorps %xmm3, %xmm13 #const
addps %xmm11, %xmm4
subps %xmm11, %xmm14
shufps $177, %xmm13, %xmm13
movaps %xmm5, (%rbx,%rax,4)
movaps %xmm4, (%r9,%rax,4)
movaps %xmm2, (%r10,%rax,4)
subps %xmm13, %xmm1
addps %xmm13, %xmm6
movaps %xmm1, (%r11,%rax,4)
movaps %xmm0, (%r12,%rax,4)
movaps %xmm14, (%r13,%rax,4)
movaps %xmm12, (%r14,%rax,4)
movaps %xmm6, (%r15,%rax,4)
addq $4, %rax
cmpq %rcx, %rax
jne X8_soft_loop
ret
#ifdef __APPLE__
.globl _x8_hard
_x8_hard:
#else
.globl x8_hard
x8_hard:
#endif
movaps (%r9), %xmm5
X8_loop:
movaps (%r8), %xmm9
X8_const_2:
movaps 0xFECA(%rdx,%rax,4), %xmm6
movaps %xmm9, %xmm11
X8_const_3:
movaps 0xFECA(%rdx,%rax,4), %xmm7
movaps 16(%r8), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
shufps $177, %xmm6, %xmm6
mulps %xmm8, %xmm6
shufps $177, %xmm7, %xmm7
subps %xmm6, %xmm11
mulps %xmm7, %xmm8
movaps %xmm11, %xmm10
addps %xmm8, %xmm9
movaps 32(%r8), %xmm15
addps %xmm9, %xmm10
subps %xmm9, %xmm11
X8_const_0:
movaps 0xFECA(%rdx,%rax,4), %xmm3
movaps %xmm15, %xmm6
X8_const_4:
movaps 0xFECA(%rdx,%rax,4), %xmm12
movaps %xmm3, %xmm2
X8_const_6:
movaps 0xFECA(%rdx,%rax,4), %xmm13
xorps %xmm5, %xmm11
movaps 48(%r8), %xmm14
subps %xmm10, %xmm2
mulps %xmm12, %xmm6
addps %xmm10, %xmm3
mulps %xmm13, %xmm15
movaps 64(%r8), %xmm10
movaps %xmm3, %xmm0
shufps $177, %xmm12, %xmm12
shufps $177, %xmm13, %xmm13
mulps %xmm14, %xmm12
mulps %xmm13, %xmm14
subps %xmm12, %xmm6
addps %xmm14, %xmm15
X8_const_5:
movaps 0xFECA(%rdx,%rax,4), %xmm7
movaps %xmm10, %xmm13
X8_const_7:
movaps 0xFECA(%rdx,%rax,4), %xmm8
movaps %xmm6, %xmm12
movaps 80(%r8), %xmm9
addq $96, %r8
mulps %xmm7, %xmm13
subps %xmm15, %xmm6
addps %xmm15, %xmm12
mulps %xmm8, %xmm10
subps %xmm12, %xmm0
addps %xmm12, %xmm3
shufps $177, %xmm7, %xmm7
xorps %xmm5, %xmm6
shufps $177, %xmm8, %xmm8
movaps %xmm2, %xmm12
mulps %xmm9, %xmm7
mulps %xmm8, %xmm9
subps %xmm7, %xmm13
addps %xmm9, %xmm10
X8_const_1:
movaps 0xFECA(%rdx,%rax,4), %xmm4
shufps $177, %xmm11, %xmm11
movaps %xmm4, %xmm1
shufps $177, %xmm6, %xmm6
addps %xmm11, %xmm1
subps %xmm11, %xmm4
addps %xmm6, %xmm12
subps %xmm6, %xmm2
movaps %xmm13, %xmm11
movaps %xmm4, %xmm14
movaps %xmm1, %xmm6
subps %xmm10, %xmm13
addps %xmm10, %xmm11
xorps %xmm5, %xmm13
addps %xmm11, %xmm4
subps %xmm11, %xmm14
shufps $177, %xmm13, %xmm13
X8_const1_0:
movaps %xmm3, 0xFECA(%rdx,%rax,4)
X8_const1_1:
movaps %xmm4, 0xFECA(%rdx,%rax,4)
X8_const1_2:
movaps %xmm2, 0xFECA(%rdx,%rax,4)
subps %xmm13, %xmm1
addps %xmm13, %xmm6
X8_const1_3:
movaps %xmm1, 0xFECA(%rdx,%rax,4)
X8_const1_4:
movaps %xmm0, 0xFECA(%rdx,%rax,4)
X8_const1_5:
movaps %xmm14, 0xFECA(%rdx,%rax,4)
X8_const1_6:
movaps %xmm12, 0xFECA(%rdx,%rax,4)
X8_const1_7:
movaps %xmm6, 0xFECA(%rdx,%rax,4)
addq $4, %rax
cmpq %rcx, %rax
jne X8_loop
#ifdef __APPLE__
.globl _sse_leaf_ee_offsets
.globl _sse_leaf_oo_offsets
.globl _sse_leaf_eo_offsets
.globl _sse_leaf_oe_offsets
.align 4
_sse_leaf_ee_offsets:
.long LEAF_EE_const_0-_leaf_ee+0x4
.long LEAF_EE_const_1-_leaf_ee+0x5
.long LEAF_EE_const_2-_leaf_ee+0x5
.long LEAF_EE_const_3-_leaf_ee+0x5
.long LEAF_EE_const_4-_leaf_ee+0x5
.long LEAF_EE_const_5-_leaf_ee+0x5
.long LEAF_EE_const_6-_leaf_ee+0x4
.long LEAF_EE_const_7-_leaf_ee+0x5
_sse_leaf_oo_offsets:
.long LEAF_OO_const_0-_leaf_oo+0x4
.long LEAF_OO_const_1-_leaf_oo+0x4
.long LEAF_OO_const_2-_leaf_oo+0x5
.long LEAF_OO_const_3-_leaf_oo+0x5
.long LEAF_OO_const_4-_leaf_oo+0x4
.long LEAF_OO_const_5-_leaf_oo+0x5
.long LEAF_OO_const_6-_leaf_oo+0x5
.long LEAF_OO_const_7-_leaf_oo+0x5
_sse_leaf_eo_offsets:
.long LEAF_EO_const_0-_leaf_eo+0x5
.long LEAF_EO_const_1-_leaf_eo+0x4
.long LEAF_EO_const_2-_leaf_eo+0x4
.long LEAF_EO_const_3-_leaf_eo+0x4
.long LEAF_EO_const_4-_leaf_eo+0x5
.long LEAF_EO_const_5-_leaf_eo+0x5
.long LEAF_EO_const_6-_leaf_eo+0x4
.long LEAF_EO_const_7-_leaf_eo+0x5
_sse_leaf_oe_offsets:
.long LEAF_OE_const_0-_leaf_oe+0x5
.long LEAF_OE_const_1-_leaf_oe+0x4
.long LEAF_OE_const_2-_leaf_oe+0x4
.long LEAF_OE_const_3-_leaf_oe+0x5
.long LEAF_OE_const_4-_leaf_oe+0x5
.long LEAF_OE_const_5-_leaf_oe+0x5
.long LEAF_OE_const_6-_leaf_oe+0x4
.long LEAF_OE_const_7-_leaf_oe+0x4
#else
.globl sse_leaf_ee_offsets
.globl sse_leaf_oo_offsets
.globl sse_leaf_eo_offsets
.globl sse_leaf_oe_offsets
.align 4
sse_leaf_ee_offsets:
.long LEAF_EE_const_0-leaf_ee+0x4
.long LEAF_EE_const_1-leaf_ee+0x5
.long LEAF_EE_const_2-leaf_ee+0x5
.long LEAF_EE_const_3-leaf_ee+0x5
.long LEAF_EE_const_4-leaf_ee+0x5
.long LEAF_EE_const_5-leaf_ee+0x5
.long LEAF_EE_const_6-leaf_ee+0x4
.long LEAF_EE_const_7-leaf_ee+0x5
sse_leaf_oo_offsets:
.long LEAF_OO_const_0-leaf_oo+0x4
.long LEAF_OO_const_1-leaf_oo+0x4
.long LEAF_OO_const_2-leaf_oo+0x5
.long LEAF_OO_const_3-leaf_oo+0x5
.long LEAF_OO_const_4-leaf_oo+0x4
.long LEAF_OO_const_5-leaf_oo+0x5
.long LEAF_OO_const_6-leaf_oo+0x5
.long LEAF_OO_const_7-leaf_oo+0x5
sse_leaf_eo_offsets:
.long LEAF_EO_const_0-leaf_eo+0x5
.long LEAF_EO_const_1-leaf_eo+0x4
.long LEAF_EO_const_2-leaf_eo+0x4
.long LEAF_EO_const_3-leaf_eo+0x4
.long LEAF_EO_const_4-leaf_eo+0x5
.long LEAF_EO_const_5-leaf_eo+0x5
.long LEAF_EO_const_6-leaf_eo+0x4
.long LEAF_EO_const_7-leaf_eo+0x5
sse_leaf_oe_offsets:
.long LEAF_OE_const_0-leaf_oe+0x5
.long LEAF_OE_const_1-leaf_oe+0x4
.long LEAF_OE_const_2-leaf_oe+0x4
.long LEAF_OE_const_3-leaf_oe+0x5
.long LEAF_OE_const_4-leaf_oe+0x5
.long LEAF_OE_const_5-leaf_oe+0x5
.long LEAF_OE_const_6-leaf_oe+0x4
.long LEAF_OE_const_7-leaf_oe+0x4
#endif
#ifdef __APPLE__
.data
#else
.section .data
#endif
.p2align 4
#ifdef __APPLE__
.globl _sse_constants
_sse_constants:
#else
.globl sse_constants
sse_constants:
#endif
.long 0x00000000,0x80000000,0x00000000,0x80000000
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
.long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
.long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
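# Added note: in these tables 0x3f3504f3 is +0.70710677f (1/sqrt(2)),
# 0xbf3504f3 is its negation and 0x3f800000 is 1.0f; the first row
# ({+0.0,-0.0,+0.0,-0.0}) is the sign mask xorps'd in above to negate
# alternating lanes.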
#ifdef __APPLE__
.globl _sse_constants_inv
_sse_constants_inv:
#else
.globl sse_constants_inv
sse_constants_inv:
#endif
.long 0x80000000,0x00000000,0x80000000,0x00000000
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
.long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
.long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3

50
3rdparty/ffts/ffts-master/src/types.h vendored Normal file
View File

@@ -0,0 +1,50 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __TYPES_H__
#define __TYPES_H__
#define __INLINE static inline __attribute__((always_inline))
#if defined(complex)
typedef complex float cdata_t;
#else
typedef float cdata_t[2];
#endif
typedef float data_t;
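/* Added note: with <complex.h> in effect (the `complex` macro defined),
   cdata_t is a C99 complex float; otherwise it is a two-float array with
   x[0] = re and x[1] = im -- the same interleaved layout either way. */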
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

46
3rdparty/ffts/ffts-master/src/vfp.h vendored Normal file
View File

@@ -0,0 +1,46 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, 2013 The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __VFP_H__
#define __VFP_H__
#include "ffts.h"
void vfp_e();
void vfp_o();
void vfp_x4();
void vfp_x8();
void vfp_end();
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

473
3rdparty/ffts/ffts-master/src/vfp.s vendored Normal file
View File

@@ -0,0 +1,473 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, 2013 The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@ assumes r0 = out
@ r1 = in ?
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 = const pointer
@ & lr = temps
.align 4
#ifdef __APPLE__
.globl _vfp_e
_vfp_e:
#else
.globl vfp_e
vfp_e:
#endif
_vfp_e_loop:
vldr s15, [r2, #8]
vldr s2, [r3] @ x0
vldr s0, [r3, #4]
vldr s4, [r4] @ x1
vldr s11, [r2]
vldr s10, [r7] @ x4
vldr s3, [r7, #4]
vldr s8, [r8] @ x5
vldr s1, [r8, #4]
vldr s14, [r9] @ x6
vldr s9, [r9, #4]
vldr s6, [r10] @ x7
vldr s12, [r10, #4]
vsub.f32 s18, s3, s1
vsub.f32 s7, s10, s8
vsub.f32 s5, s14, s6
vadd.f32 s6, s14, s6
vldr s24, [r5, #4]
vsub.f32 s14, s9, s12
vldr s22, [r6, #4]
vadd.f32 s8, s10, s8
vldr s28, [r6] @ x3
vldr s17, [r5] @ x2
vadd.f32 s10, s9, s12
vmul.f32 s13, s18, s15
vmul.f32 s9, s7, s11
vmul.f32 s16, s5, s11
vmul.f32 s18, s18, s11
vmul.f32 s30, s14, s11
vldr s11, [r4, #4]
add r3, r3, #8
add r4, r4, #8
add r5, r5, #8
add r6, r6, #8
add r7, r7, #8
add r8, r8, #8
add r9, r9, #8
add r10, r10, #8
vmul.f32 s12, s5, s15
vmul.f32 s20, s14, s15
vadd.f32 s5, s2, s4
vadd.f32 s3, s3, s1
vmul.f32 s15, s7, s15
vadd.f32 s1, s24, s22
vsub.f32 s7, s24, s22
vadd.f32 s24, s17, s28
vadd.f32 s26, s0, s11
vsub.f32 s14, s9, s13
vsub.f32 s2, s2, s4
vadd.f32 s4, s16, s20
vsub.f32 s22, s0, s11
vsub.f32 s16, s17, s28
vadd.f32 s9, s5, s24
vadd.f32 s28, s18, s15
vadd.f32 s13, s8, s6
vsub.f32 s5, s5, s24
vsub.f32 s24, s8, s6
vadd.f32 s11, s26, s1
vsub.f32 s12, s30, s12
vadd.f32 s20, s3, s10
vsub.f32 s15, s3, s10
vsub.f32 s3, s26, s1
vadd.f32 s18, s9, s13
vadd.f32 s10, s14, s4
vadd.f32 s6, s2, s7 @
vsub.f32 s0, s2, s7 @
vadd.f32 s26, s11, s20
vsub.f32 s4, s14, s4
vsub.f32 s8, s22, s16 @
vadd.f32 s1, s28, s12
ldr lr, [r12], #4
add lr, r0, lr, lsl #2
subs r11, r11, #1
vstr s18, [lr]
vsub.f32 s2, s28, s12
vadd.f32 s12, s22, s16 @
vsub.f32 s16, s3, s24 @
vsub.f32 s13, s9, s13
vstr s26, [lr, #4]
vadd.f32 s28, s5, s15 @
vsub.f32 s7, s5, s15 @
vadd.f32 s14, s6, s10
vadd.f32 s5, s8, s1
vadd.f32 s9, s0, s2 @
vsub.f32 s2, s0, s2 @
vsub.f32 s11, s11, s20
vstr s28, [lr, #16]
vadd.f32 s3, s3, s24 @
vstr s16, [lr, #20]
vsub.f32 s6, s6, s10
vstr s13, [lr, #32]
vsub.f32 s13, s12, s4 @
vsub.f32 s8, s8, s1
vadd.f32 s0, s12, s4 @
vstr s11, [lr, #36]
vstr s7, [lr, #48]
vstr s3, [lr, #52]
vstr s14, [lr, #8]
vstr s5, [lr, #12]
vstr s9, [lr, #24]
vstr s13, [lr, #28]
vstr s6, [lr, #40]
vstr s8, [lr, #44]
vstr s2, [lr, #56]
vstr s0, [lr, #60]
bne _vfp_e_loop
@ assumes r0 = out
@ r1 = in ?
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _vfp_o
_vfp_o:
#else
.globl vfp_o
vfp_o:
#endif
_vfp_o_loop:
vldr s4, [r3] @ x0
vldr s0, [r3, #4]
vldr s6, [r4] @ x1
vldr s5, [r4, #4]
vldr s7, [r5] @ x2
vldr s1, [r5, #4]
vldr s3, [r6] @ x3
vldr s8, [r6, #4]
subs r11, r11, #1
ldr r2, [r12], #4
add r2, r0, r2, lsl #2
vadd.f32 s2, s4, s6
vadd.f32 s14, s0, s5
vadd.f32 s10, s1, s8
vsub.f32 s4, s4, s6
vsub.f32 s0, s0, s5
vadd.f32 s12, s7, s3
vsub.f32 s6, s7, s3
vsub.f32 s8, s1, s8
vadd.f32 s5, s14, s10
vsub.f32 s10, s14, s10
vadd.f32 s7, s2, s12
vsub.f32 s1, s0, s6 @
vsub.f32 s12, s2, s12
vadd.f32 s3, s4, s8 @
vsub.f32 s2, s4, s8 @
vadd.f32 s0, s0, s6 @
vstr s7, [r2]
vldr s7, [r9] @ x2
vstr s5, [r2, #4]
vstr s3, [r2, #8]
vstr s1, [r2, #12]
vstr s12, [r2, #16]
vstr s10, [r2, #20]
vstr s2, [r2, #24]
vstr s0, [r2, #28]
vldr s4, [r7] @ x0
vldr s0, [r7, #4]
vldr s6, [r8] @ x1
vldr s5, [r8, #4]
vldr s3, [r10] @ x3
vldr s8, [r10, #4]
vldr s1, [r9, #4]
add r3, r3, #8
add r4, r4, #8
add r5, r5, #8
add r6, r6, #8
add r7, r7, #8
add r8, r8, #8
add r9, r9, #8
add r10, r10, #8
vadd.f32 s2, s4, s6
vadd.f32 s14, s0, s5
vadd.f32 s10, s1, s8
vsub.f32 s4, s4, s6
vsub.f32 s0, s0, s5
vadd.f32 s12, s7, s3
vsub.f32 s6, s7, s3
vsub.f32 s8, s1, s8
vadd.f32 s5, s14, s10
vsub.f32 s10, s14, s10
vadd.f32 s7, s2, s12
vsub.f32 s1, s0, s6 @
vsub.f32 s12, s2, s12
vadd.f32 s3, s4, s8 @
vsub.f32 s2, s4, s8 @
vadd.f32 s0, s0, s6 @
vstr s7, [r2, #32]
vstr s5, [r2, #36]
vstr s3, [r2, #40]
vstr s1, [r2, #44]
vstr s12, [r2, #48]
vstr s10, [r2, #52]
vstr s2, [r2, #56]
vstr s0, [r2, #60]
bne _vfp_o_loop
.align 4
#ifdef __APPLE__
.globl _vfp_x4
_vfp_x4:
#else
.globl vfp_x4
vfp_x4:
#endif
add r3, r0, #0
add r7, r2, #0
add r4, r0, r1, lsl #1
add r5, r0, r1, lsl #2
add r6, r4, r1, lsl #2
mov r11, #4
_vfp_x4_loop:
vldr s8, [r3, #0]
vldr s9, [r3, #4]
vldr s10, [r4, #0]
vldr s11, [r4, #4]
vldr s12, [r5, #0]
vldr s13, [r5, #4]
vldr s14, [r6, #0]
vldr s15, [r6, #4]
vldr s2, [r7, #0]
vldr s3, [r7, #4]
add r7, r7, #8
subs r11, r11, #1
vmul.f32 s0, s13, s3
vmul.f32 s5, s12, s2
vmul.f32 s1, s14, s2
vmul.f32 s4, s14, s3
vmul.f32 s14, s12, s3
vmul.f32 s13, s13, s2
vmul.f32 s12, s15, s3
vmul.f32 s2, s15, s2
vsub.f32 s0, s5, s0
vadd.f32 s13, s13, s14
vadd.f32 s12, s12, s1
vsub.f32 s1, s2, s4
vadd.f32 s15, s0, s12
vsub.f32 s12, s0, s12
vadd.f32 s14, s13, s1
vsub.f32 s13, s13, s1
vadd.f32 s0, s8, s15
vadd.f32 s1, s9, s14
vadd.f32 s2, s10, s13 @
vsub.f32 s4, s8, s15
vsub.f32 s3, s11, s12 @
vstr s0, [r3, #0]
vstr s1, [r3, #4]
add r3, r3, #8
vsub.f32 s5, s9, s14
vsub.f32 s6, s10, s13 @
vadd.f32 s7, s11, s12 @
vstr s2, [r4, #0]
vstr s3, [r4, #4]
add r4, r4, #8
vstr s4, [r5, #0]
vstr s5, [r5, #4]
add r5, r5, #8
vstr s6, [r6, #0]
vstr s7, [r6, #4]
add r6, r6, #8
bne _vfp_x4_loop
bx lr
.align 4
#ifdef __APPLE__
.globl _vfp_x8
_vfp_x8:
#else
.globl vfp_x8
vfp_x8:
#endif
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #3
_vfp_x8_loop:
vldr s10, [r3, #0] @ x0-re
vldr s8, [r3, #4] @ x0-im
vldr s2, [r4, #0] @ x1-re
vldr s0, [r4, #4] @ x1-im
vldr s6, [r5, #0] @ x2-re
vldr s4, [r5, #4] @ x2-im
vldr s13, [r6, #0] @ x3-re
vldr s15, [r6, #4] @ x3-im
vldr s7, [r12]
vldr s11, [r12, #4]
vldr s5, [r7, #0] @ x4-re
vldr s1, [r7, #4] @ x4-im
vldr s28, [r9, #0] @ x6-re
vldr s18, [r9, #4] @ x6-im
adds r11, r11, #1
vmul.f32 s14, s15, s7
vldr s24, [r12, #12]
vmul.f32 s12, s13, s11
vmul.f32 s26, s13, s7
vldr s13, [r12, #8]
vmul.f32 s3, s4, s11
vmul.f32 s15, s15, s11
vmul.f32 s16, s4, s7
vmul.f32 s9, s6, s7
vmul.f32 s11, s6, s11
vmul.f32 s7, s18, s24
vmul.f32 s20, s1, s24
vmul.f32 s30, s5, s13
vadd.f32 s4, s26, s15
vsub.f32 s12, s14, s12
vsub.f32 s6, s9, s3
vadd.f32 s14, s16, s11
vmul.f32 s22, s28, s13
vmul.f32 s26, s28, s24
vmul.f32 s18, s18, s13
vmul.f32 s5, s5, s24
vmul.f32 s1, s1, s13
vsub.f32 s9, s30, s20
vadd.f32 s16, s14, s12
vadd.f32 s3, s22, s7
vadd.f32 s15, s6, s4
vsub.f32 s11, s18, s26
vadd.f32 s18, s1, s5
vadd.f32 s13, s8, s16
vadd.f32 s1, s9, s3
vadd.f32 s7, s10, s15
vsub.f32 s15, s10, s15
vsub.f32 s10, s9, s3
vadd.f32 s5, s18, s11
vsub.f32 s11, s18, s11
vsub.f32 s8, s8, s16
vadd.f32 s20, s7, s1
vsub.f32 s7, s7, s1
vadd.f32 s18, s13, s5
vadd.f32 s16, s15, s11 @
vsub.f32 s9, s8, s10 @
vsub.f32 s3, s13, s5
vsub.f32 s1, s15, s11 @
vstr s20, [r3]
vadd.f32 s8, s8, s10 @
vstr s18, [r3, #4]
add r3, r3, #8
vstr s16, [r5]
vstr s9, [r5, #4]
add r5, r5, #8
vstr s7, [r7]
vstr s3, [r7, #4]
add r7, r7, #8
vstr s1, [r9]
vstr s8, [r9, #4]
add r9, r9, #8
vldr s10, [r8, #0] @ x5-re
vldr s8, [r8, #4] @ x5-im
vldr s5, [r10, #0] @ x7-re
vldr s11, [r10, #4] @ x7-im
vldr s1, [r12, #16]
vldr s15, [r12, #20]
add r12, r12, #24
vmul.f32 s9, s5, s1
vmul.f32 s3, s11, s15
vmul.f32 s13, s10, s1
vmul.f32 s7, s8, s15
vmul.f32 s5, s5, s15
vmul.f32 s11, s11, s1
vmul.f32 s10, s10, s15
vmul.f32 s15, s8, s1
vsub.f32 s1, s14, s12
vadd.f32 s8, s9, s3
vsub.f32 s3, s6, s4
vsub.f32 s12, s13, s7
vsub.f32 s5, s11, s5
vadd.f32 s7, s15, s10
vadd.f32 s4, s2, s1 @
vsub.f32 s2, s2, s1 @
vsub.f32 s6, s0, s3 @
vadd.f32 s10, s12, s8
vsub.f32 s9, s12, s8
vadd.f32 s0, s0, s3 @
vsub.f32 s1, s7, s5
vadd.f32 s14, s7, s5
vadd.f32 s7, s4, s10
vsub.f32 s8, s4, s10
vsub.f32 s12, s0, s9 @
vadd.f32 s3, s2, s1 @
vadd.f32 s5, s6, s14
vsub.f32 s4, s6, s14
vsub.f32 s2, s2, s1 @
vadd.f32 s0, s0, s9 @
vstr s7, [r4]
vstr s5, [r4, #4]
add r4, r4, #8
vstr s3, [r6]
vstr s12, [r6, #4]
add r6, r6, #8
vstr s8, [r8]
vstr s4, [r8, #4]
add r8, r8, #8
vstr s2, [r10]
vstr s0, [r10, #4]
add r10, r10, #8
bne _vfp_x8_loop
bx lr
.align 4
#ifdef __APPLE__
.globl _vfp_end
_vfp_end:
#else
.globl vfp_end
vfp_end:
#endif
bx lr