Windows implementation WIP

commit 3a7c1ba0eb (parent 16b731cea9)
Author: Kearwood Gilbert
Date: 2016-07-10 03:33:58 -07:00
156 changed files with 86861 additions and 75 deletions

3rdparty/ffts/ffts-master/src/Makefile.am vendored Normal file
@@ -0,0 +1,34 @@
lib_LTLIBRARIES = libffts.la
libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c ffts_real_nd.c patterns.c
libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h
if DYNAMIC_DISABLED
libffts_la_SOURCES += ffts_static.c
else
libffts_la_SOURCES += codegen.c
endif
libffts_includedir=$(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
if HAVE_VFP
libffts_la_SOURCES += vfp.s
else
if HAVE_NEON
libffts_la_SOURCES += neon.s
if DYNAMIC_DISABLED
libffts_la_SOURCES += neon_static_f.s neon_static_i.s
endif
else
if HAVE_SSE
libffts_la_SOURCES += sse.s
endif
endif
endif

3rdparty/ffts/ffts-master/src/Makefile.in vendored Normal file
@@ -0,0 +1,730 @@
# Makefile.in generated by automake 1.14 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
@HAVE_VFP_TRUE@am__append_3 = vfp.s
@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon.s
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon_static_f.s neon_static_i.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s
subdir = src
DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
$(top_srcdir)/depcomp $(libffts_include_HEADERS)
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
$(top_srcdir)/m4/ax_check_java_home.m4 \
$(top_srcdir)/m4/ax_java_options.m4 \
$(top_srcdir)/m4/ax_jni_include_dir.m4 \
$(top_srcdir)/m4/ax_prog_jar.m4 \
$(top_srcdir)/m4/ax_prog_javac.m4 \
$(top_srcdir)/m4/ax_prog_javac_works.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
am__vpath_adj = case $$p in \
$(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
*) f=$$p;; \
esac;
am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
am__install_max = 40
am__nobase_strip_setup = \
srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
am__nobase_strip = \
for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
am__nobase_list = $(am__nobase_strip_setup); \
for p in $$list; do echo "$$p $$p"; done | \
sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
$(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
if (++n[$$2] == $(am__install_max)) \
{ print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
END { for (dir in files) print dir, files[dir] }'
am__base_list = \
sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
am__uninstall_files_from_dir = { \
test -z "$$files" \
|| { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
|| { echo " ( cd '$$dir' && rm -f" $$files ")"; \
$(am__cd) "$$dir" && rm -f $$files; }; \
}
am__installdirs = "$(DESTDIR)$(libdir)" \
"$(DESTDIR)$(libffts_includedir)"
LTLIBRARIES = $(lib_LTLIBRARIES)
libffts_la_LIBADD =
am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \
ffts_real.c ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s neon.s \
neon_static_f.s neon_static_i.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon.lo
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@ neon_static_i.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo
am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
$(am__objects_3) $(am__objects_4) $(am__objects_5) \
$(am__objects_6)
libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__depfiles_maybe = depfiles
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \
$(CCASFLAGS)
AM_V_CCAS = $(am__v_CCAS_@AM_V@)
am__v_CCAS_ = $(am__v_CCAS_@AM_DEFAULT_V@)
am__v_CCAS_0 = @echo " CCAS " $@;
am__v_CCAS_1 =
SOURCES = $(libffts_la_SOURCES)
DIST_SOURCES = $(am__libffts_la_SOURCES_DIST)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
HEADERS = $(libffts_include_HEADERS)
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AWK = @AWK@
CC = @CC@
CCAS = @CCAS@
CCASDEPMODE = @CCASDEPMODE@
CCASFLAGS = @CCASFLAGS@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CXX = @CXX@
CXXCPP = @CXXCPP@
CXXDEPMODE = @CXXDEPMODE@
CXXFLAGS = @CXXFLAGS@
CYGPATH_W = @CYGPATH_W@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
JAR = @JAR@
JAVA = @JAVA@
JAVAC = @JAVAC@
JAVACFLAGS = @JAVACFLAGS@
JAVAFLAGS = @JAVAFLAGS@
JAVAPREFIX = @JAVAPREFIX@
JAVA_PATH_NAME = @JAVA_PATH_NAME@
JNI_CPPFLAGS = @JNI_CPPFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
VERSION = @VERSION@
_ACJNI_JAVAC = @_ACJNI_JAVAC@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_CXX = @ac_ct_CXX@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
lib_LTLIBRARIES = libffts.la
libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c \
ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
patterns.h types.h vfp.h $(am__append_1) $(am__append_2) \
$(am__append_3) $(am__append_4) $(am__append_5) \
$(am__append_6)
libffts_includedir = $(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj .s
$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
&& { if test -f $@; then exit 0; else break; fi; }; \
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu src/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
*) \
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
esac;
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
install-libLTLIBRARIES: $(lib_LTLIBRARIES)
@$(NORMAL_INSTALL)
@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
list2=; for p in $$list; do \
if test -f $$p; then \
list2="$$list2 $$p"; \
else :; fi; \
done; \
test -z "$$list2" || { \
echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
$(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
}
uninstall-libLTLIBRARIES:
@$(NORMAL_UNINSTALL)
@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
for p in $$list; do \
$(am__strip_dir) \
echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
done
clean-libLTLIBRARIES:
-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
@list='$(lib_LTLIBRARIES)'; \
locs=`for p in $$list; do echo $$p; done | \
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
sort -u`; \
test -z "$$locs" || { \
echo rm -f $${locs}; \
rm -f $${locs}; \
}
libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES)
$(AM_V_CCLD)$(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@
.c.o:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
.c.obj:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.c.lo:
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
.s.o:
$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $<
.s.obj:
$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
.s.lo:
$(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $<
mostlyclean-libtool:
-rm -f *.lo
clean-libtool:
-rm -rf .libs _libs
install-libffts_includeHEADERS: $(libffts_include_HEADERS)
@$(NORMAL_INSTALL)
@list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
if test -n "$$list"; then \
echo " $(MKDIR_P) '$(DESTDIR)$(libffts_includedir)'"; \
$(MKDIR_P) "$(DESTDIR)$(libffts_includedir)" || exit 1; \
fi; \
for p in $$list; do \
if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
echo "$$d$$p"; \
done | $(am__base_list) | \
while read files; do \
echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libffts_includedir)'"; \
$(INSTALL_HEADER) $$files "$(DESTDIR)$(libffts_includedir)" || exit $$?; \
done
uninstall-libffts_includeHEADERS:
@$(NORMAL_UNINSTALL)
@list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
dir='$(DESTDIR)$(libffts_includedir)'; $(am__uninstall_files_from_dir)
ID: $(am__tagged_files)
$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
set x; \
here=`pwd`; \
$(am__define_uniq_tagged_files); \
shift; \
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
test -n "$$unique" || unique=$$empty_fix; \
if test $$# -gt 0; then \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
"$$@" $$unique; \
else \
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
$$unique; \
fi; \
fi
ctags: ctags-am
CTAGS: ctags
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
$(am__define_uniq_tagged_files); \
test -z "$(CTAGS_ARGS)$$unique" \
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
$$unique
GTAGS:
here=`$(am__cd) $(top_builddir) && pwd` \
&& $(am__cd) $(top_srcdir) \
&& gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am
cscopelist-am: $(am__tagged_files)
list='$(am__tagged_files)'; \
case "$(srcdir)" in \
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
*) sdir=$(subdir)/$(srcdir) ;; \
esac; \
for i in $$list; do \
if test -f "$$i"; then \
echo "$(subdir)/$$i"; \
else \
echo "$$sdir/$$i"; \
fi; \
done >> $(top_builddir)/cscope.files
distclean-tags:
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
list='$(DISTFILES)'; \
dist_files=`for file in $$list; do echo $$file; done | \
sed -e "s|^$$srcdirstrip/||;t" \
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
case $$dist_files in \
*/*) $(MKDIR_P) `echo "$$dist_files" | \
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
sort -u` ;; \
esac; \
for file in $$dist_files; do \
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
if test -d $$d/$$file; then \
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
if test -d "$(distdir)/$$file"; then \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
fi; \
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
else \
test -f "$(distdir)/$$file" \
|| cp -p $$d/$$file "$(distdir)/$$file" \
|| exit 1; \
fi; \
done
check-am: all-am
check: check-am
all-am: Makefile $(LTLIBRARIES) $(HEADERS)
installdirs:
for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libffts_includedir)"; do \
test -z "$$dir" || $(MKDIR_P) "$$dir"; \
done
install: install-am
install-exec: install-exec-am
install-data: install-data-am
uninstall: uninstall-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-am
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
clean: clean-am
clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
mostlyclean-am
distclean: distclean-am
-rm -rf ./$(DEPDIR)
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
dvi: dvi-am
dvi-am:
html: html-am
html-am:
info: info-am
info-am:
install-data-am: install-libffts_includeHEADERS
install-dvi: install-dvi-am
install-dvi-am:
install-exec-am: install-libLTLIBRARIES
install-html: install-html-am
install-html-am:
install-info: install-info-am
install-info-am:
install-man:
install-pdf: install-pdf-am
install-pdf-am:
install-ps: install-ps-am
install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-am
-rm -rf ./$(DEPDIR)
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-am
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-am
pdf-am:
ps: ps-am
ps-am:
uninstall-am: uninstall-libLTLIBRARIES \
uninstall-libffts_includeHEADERS
.MAKE: install-am install-strip
.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
clean-libLTLIBRARIES clean-libtool cscopelist-am ctags \
ctags-am distclean distclean-compile distclean-generic \
distclean-libtool distclean-tags distdir dvi dvi-am html \
html-am info info-am install install-am install-data \
install-data-am install-dvi install-dvi-am install-exec \
install-exec-am install-html install-html-am install-info \
install-info-am install-libLTLIBRARIES \
install-libffts_includeHEADERS install-man install-pdf \
install-pdf-am install-ps install-ps-am install-strip \
installcheck installcheck-am installdirs maintainer-clean \
maintainer-clean-generic mostlyclean mostlyclean-compile \
mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
tags tags-am uninstall uninstall-am uninstall-libLTLIBRARIES \
uninstall-libffts_includeHEADERS
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

3rdparty/ffts/ffts-master/src/codegen.c vendored Normal file

@@ -0,0 +1,732 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "codegen.h"
#include "macros.h"
#include "ffts.h"
#ifdef __APPLE__
#include <libkern/OSCacheControl.h>
#endif
#include <sys/types.h>
#include <sys/mman.h>
#ifdef HAVE_NEON
#include "codegen_arm.h"
#include "neon.h"
#elif HAVE_VFP
#include "codegen_arm.h"
#include "vfp.h"
#else
#include "codegen_sse.h"
#include "macros-sse.h"
#endif
#ifdef __ANDROID__
#include <unistd.h>
#endif
int tree_count(int N, int leafN, int offset) {
if(N <= leafN) return 0;
int count = 0;
count += tree_count(N/4, leafN, offset);
count += tree_count(N/8, leafN, offset + N/4);
count += tree_count(N/8, leafN, offset + N/4 + N/8);
count += tree_count(N/4, leafN, offset + N/2);
count += tree_count(N/4, leafN, offset + 3*N/4);
return 1 + count;
}
void elaborate_tree(size_t **p, int N, int leafN, int offset) {
if(N <= leafN) return;
elaborate_tree(p, N/4, leafN, offset);
elaborate_tree(p, N/8, leafN, offset + N/4);
elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
elaborate_tree(p, N/4, leafN, offset + N/2);
elaborate_tree(p, N/4, leafN, offset + 3*N/4);
(*p)[0] = N;
(*p)[1] = offset*2;
(*p)+=2;
}
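
tree_count() sizes the schedule that elaborate_tree() then fills in post-order: one (N, 2*offset) pair per sub-transform larger than the leaf size, children before parents. A standalone sketch (hypothetical driver, mirroring how ffts_generate_func_code() below uses these two routines) for N = 64, leafN = 8:

#include <stdio.h>
#include <stdlib.h>

/* tree_count() and elaborate_tree() exactly as defined above. */
static int tree_count(int N, int leafN, int offset) {
    if (N <= leafN) return 0;
    return 1 + tree_count(N/4, leafN, offset)
             + tree_count(N/8, leafN, offset + N/4)
             + tree_count(N/8, leafN, offset + N/4 + N/8)
             + tree_count(N/4, leafN, offset + N/2)
             + tree_count(N/4, leafN, offset + 3*N/4);
}

static void elaborate_tree(size_t **p, int N, int leafN, int offset) {
    if (N <= leafN) return;
    elaborate_tree(p, N/4, leafN, offset);
    elaborate_tree(p, N/8, leafN, offset + N/4);
    elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
    elaborate_tree(p, N/4, leafN, offset + N/2);
    elaborate_tree(p, N/4, leafN, offset + 3*N/4);
    (*p)[0] = N; (*p)[1] = offset*2; (*p) += 2;
}

int main(void) {
    int count = tree_count(64, 8, 0) + 1;   /* +1 for the terminator pair */
    size_t *ps = malloc(count * 2 * sizeof(size_t));
    size_t *pps = ps;
    elaborate_tree(&pps, 64, 8, 0);
    pps[0] = pps[1] = 0;                    /* terminator, as in ffts_generate_func_code() */
    for (pps = ps; pps[0]; pps += 2)
        printf("N=%zu at output offset %zu\n", pps[0], pps[1]);
    free(ps);
    /* prints: 16 at 0, 16 at 64, 16 at 96, then the final 64 at 0 */
    return 0;
}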
uint32_t LUT_offset(size_t N, size_t leafN) {
int i;
size_t p_lut_size = 0;
size_t lut_size = 0;
int hardcoded = 0;
size_t n_luts = __builtin_ctzl(N/leafN);
int n = leafN*2;
//if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
for(i=0;i<n_luts-1;i++) {
p_lut_size = lut_size;
if(!i || hardcoded) {
#ifdef __arm__
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
lut_size += n/4 * 2 * sizeof(cdata_t);
#endif
// n *= 2;
} else {
#ifdef __arm__
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
#endif
}
n *= 2;
}
return lut_size;
}
#ifdef __arm__
typedef uint32_t insns_t;
#else
typedef uint8_t insns_t;
#endif
#define P(x) (*(*p)++ = x)
void insert_nops(uint8_t **p, uint32_t count) {
switch(count) {
case 0: break;
case 2: P(0x66);
case 1: P(0x90); break;
case 3: P(0x0F); P(0x1F); P(0x00); break;
case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break;
case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break;
case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
default:
P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00);
insert_nops(p, count-9);
break;
}
}
void align_mem16(uint8_t **p, uint32_t offset) {
#ifdef __x86_64__
int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf);
r = (16 + r) & 0xf;
insert_nops(p, r);
#endif
}
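
align_mem16() pads the instruction stream with the multi-byte NOPs from insert_nops() so that *p + offset lands on a 16-byte boundary. A quick standalone check of that padding arithmetic (my own sketch, not part of the source):

#include <assert.h>
#include <stdint.h>

/* Same arithmetic as align_mem16(), with uintptr_t in place of the
 * code pointer so it can be tested over all 16 residues. */
static uint32_t pad_for(uintptr_t p, uint32_t offset) {
    int r = (16 - (offset & 0xf)) - (int)(p & 0xf);
    return (16 + r) & 0xf;
}

int main(void) {
    uintptr_t p;
    uint32_t off;
    for (p = 0; p < 32; p++)
        for (off = 0; off < 16; off++)
            assert(((p + pad_for(p, off) + off) & 0xf) == 0);
    return 0;
}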
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
int count = tree_count(N, leafN, 0) + 1;
size_t *ps = malloc(count * 2 * sizeof(size_t));
size_t *pps = ps;
#ifdef __x86_64__
if(sign < 0) p->constants = sse_constants;
else p->constants = sse_constants_inv;
#endif
elaborate_tree(&pps, N, leafN, 0);
pps[0] = 0;
pps[1] = 0;
pps = ps;
#ifdef __arm__
if(N < 8192) p->transform_size = 8192;
else p->transform_size = N;
#else
if(N < 2048) p->transform_size = 16384;
else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
#endif
#ifdef __APPLE__
p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
#else
#define MAP_ANONYMOUS 0x20
p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
#endif
/*
if(p->transform_base == MAP_FAILED) {
fprintf(stderr, "MAP FAILED\n");
exit(1);
}*/
insns_t *func = p->transform_base;//valloc(8192);
insns_t *fp = func;
//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
//fprintf(stderr, "Base address = %016p\n", func);
if(!func) {
fprintf(stderr, "NOMEM\n");
exit(1);
}
insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
/*
* Changes adds to subtracts and vice versa to allow the computation
* of both the IFFT and FFT
*/
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
}
fp += (neon_x8_t - neon_x8) / 4;
#else
memcpy(fp, vfp_x8, vfp_end - vfp_x8);
if(sign > 0) {
fp[65] ^= 0x00000040;
fp[66] ^= 0x00000040;
fp[68] ^= 0x00000040;
fp[70] ^= 0x00000040;
fp[103] ^= 0x00000040;
fp[104] ^= 0x00000040;
fp[105] ^= 0x00000040;
fp[108] ^= 0x00000040;
fp[113] ^= 0x00000040;
fp[114] ^= 0x00000040;
fp[117] ^= 0x00000040;
fp[118] ^= 0x00000040;
}
fp += (vfp_end - vfp_x8) / 4;
#endif
#else
align_mem16(&fp, 0);
x_8_addr = fp;
align_mem16(&fp, 5);
memcpy(fp, x8_soft, x8_hard - x8_soft);
fp += (x8_hard - x8_soft);
//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
#endif
//uint32_t *x_8_t_addr = fp;
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
insns_t *x_4_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_x4, neon_x8 - neon_x4);
if(sign < 0) {
fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
}
fp += (neon_x8 - neon_x4) / 4;
#else
memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
if(sign > 0) {
fp[36] ^= 0x00000040;
fp[38] ^= 0x00000040;
fp[43] ^= 0x00000040;
fp[44] ^= 0x00000040;
}
fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
align_mem16(&fp, 0);
x_4_addr = fp;
memcpy(fp, x4, x8_soft - x4);
fp += (x8_soft - x4);
#endif
insns_t *start = fp;
#ifdef __arm__
*fp = PUSH_LR(); fp++;
*fp = 0xed2d8b10; fp++;
ADDI(&fp, 3, 1, 0);
ADDI(&fp, 7, 1, N);
ADDI(&fp, 5, 1, 2*N);
ADDI(&fp, 10, 7, 2*N);
ADDI(&fp, 4, 5, 2*N);
ADDI(&fp, 8, 10, 2*N);
ADDI(&fp, 6, 4, 2*N);
ADDI(&fp, 9, 8, 2*N);
*fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
// *fp++ = LDRI(1, 0, 4); // load ws into r1
ADDI(&fp, 1, 0, 0);
ADDI(&fp, 0, 2, 0); // mov out into r0
#endif
#ifdef __arm__
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
#ifdef HAVE_NEON
MOVI(&fp, 11, p->i0);
#else
MOVI(&fp, 11, p->i0);
#endif
#else
align_mem16(&fp, 0);
start = fp;
*fp++ = 0x4c;
*fp++ = 0x8b;
*fp++ = 0x07;
uint32_t lp_cnt = p->i0 * 4;
MOVI(&fp, RCX, lp_cnt);
//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
#endif
//fp++;
#ifdef __arm__
#ifdef HAVE_NEON
memcpy(fp, neon_ee, neon_oo - neon_ee);
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_oo - neon_ee) / 4;
#else
memcpy(fp, vfp_e, vfp_o - vfp_e);
if(sign > 0) {
fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
}
fp += (vfp_o - vfp_e) / 4;
#endif
#else
//fprintf(stderr, "Body start address = %016p\n", start);
PUSH(&fp, RBP);
PUSH(&fp, RBX);
PUSH(&fp, R10);
PUSH(&fp, R11);
PUSH(&fp, R12);
PUSH(&fp, R13);
PUSH(&fp, R14);
PUSH(&fp, R15);
int i;
memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
//fprintf(stderr, "Constants address = %016p\n", sse_constants);
//fprintf(stderr, "Constants address = %016p\n", p->constants);
//int32_t val = READ_IMM32(fp + 3);
//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));
//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
//fprintf(stderr, "IMM = 0x%llx\n", v2);
//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp ));
fp += (leaf_ee - leaf_ee_init);
//fprintf(stderr, "Leaf start address = %016p\n", fp);
align_mem16(&fp, 9);
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);
fp += (leaf_oo - leaf_ee);
if(__builtin_ctzl(N) & 1){
if(p->i1) {
lp_cnt += p->i1 * 4;
MOVI(&fp, RCX, lp_cnt);
align_mem16(&fp, 4);
memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
fp += (leaf_eo - leaf_oo);
}
memcpy(fp, leaf_oe, leaf_end - leaf_oe);
lp_cnt += 4;
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4);
fp += (leaf_end - leaf_oe);
}else{
memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
lp_cnt += 4;
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4);
fp += (leaf_oe - leaf_eo);
if(p->i1) {
lp_cnt += p->i1 * 4;
MOVI(&fp, RCX, lp_cnt);
align_mem16(&fp, 4);
memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
fp += (leaf_eo - leaf_oo);
}
}
if(p->i1) {
lp_cnt += p->i1 * 4;
MOVI(&fp, RCX, lp_cnt);
align_mem16(&fp, 9);
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4);
fp += (leaf_oo - leaf_ee);
}
//fprintf(stderr, "Body start address = %016p\n", fp);
//LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
memcpy(fp, x_init, x4 - x_init);
//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
fp += (x4 - x_init);
int32_t pAddr = 0;
int32_t pN = 0;
int32_t pLUT = 0;
count = 2;
while(pps[0]) {
if(!pN) {
MOVI(&fp, RCX, pps[0] / 4);
}else{
if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
if(pps[0] > leafN && pps[0] - pN) {
int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
*fp++ = 0xc1;
if(diff > 0) {
*fp++ = 0xe1;
*fp++ = (diff & 0xff);
}else{
*fp++ = 0xe9;
*fp++ = ((-diff) & 0xff);
}
}
}
if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
if(pps[0] == 2*leafN) {
CALL(&fp, x_4_addr);
// }else if(!pps[2]){
// //uint32_t *x_8_t_addr = fp;
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
// fp += (neon_ee - neon_x8_t) / 4;
// //*fp++ = BL(fp+2, x_8_t_addr);
}else{
CALL(&fp, x_8_addr);
}
pAddr = pps[1] * 4;
if(pps[0] > leafN)
pN = pps[0];
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
}
#endif
#ifdef __arm__
#ifdef HAVE_NEON
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);
if(p->i1) {
MOVI(&fp, 11, p->i1);
memcpy(fp, neon_oo, neon_eo - neon_oo);
if(sign < 0) {
fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_eo - neon_oo) / 4;
}
*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++;
memcpy(fp, neon_oe, neon_end - neon_oe);
if(sign < 0) {
fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
}
fp += (neon_end - neon_oe) / 4;
}else{
*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++;
memcpy(fp, neon_eo, neon_oe - neon_eo);
if(sign < 0) {
fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
}
fp += (neon_oe - neon_eo) / 4;
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);
if(p->i1) {
MOVI(&fp, 11, p->i1);
memcpy(fp, neon_oo, neon_eo - neon_oo);
if(sign < 0) {
fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_eo - neon_oo) / 4;
}
}
if(p->i1) {
ADDI(&fp, 2, 3, 0);
ADDI(&fp, 3, 7, 0);
ADDI(&fp, 7, 2, 0);
ADDI(&fp, 2, 4, 0);
ADDI(&fp, 4, 8, 0);
ADDI(&fp, 8, 2, 0);
ADDI(&fp, 2, 5, 0);
ADDI(&fp, 5, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 6, 0);
ADDI(&fp, 6, 10, 0);
ADDI(&fp, 10, 2, 0);
ADDI(&fp, 2, 9, 0);
ADDI(&fp, 9, 10, 0);
ADDI(&fp, 10, 2, 0);
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
MOVI(&fp, 11, p->i1);
memcpy(fp, neon_ee, neon_oo - neon_ee);
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_oo - neon_ee) / 4;
}
#else
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);
MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
memcpy(fp, vfp_o, vfp_x4 - vfp_o);
if(sign > 0) {
fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
}
fp += (vfp_x4 - vfp_o) / 4;
ADDI(&fp, 2, 3, 0);
ADDI(&fp, 3, 7, 0);
ADDI(&fp, 7, 2, 0);
ADDI(&fp, 2, 4, 0);
ADDI(&fp, 4, 8, 0);
ADDI(&fp, 8, 2, 0);
ADDI(&fp, 2, 5, 0);
ADDI(&fp, 5, 9, 0);
ADDI(&fp, 9, 2, 0);
ADDI(&fp, 2, 6, 0);
ADDI(&fp, 6, 10, 0);
ADDI(&fp, 10, 2, 0);
ADDI(&fp, 2, 9, 0);
ADDI(&fp, 9, 10, 0);
ADDI(&fp, 10, 2, 0);
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
memcpy(fp, vfp_e, vfp_o - vfp_e);
if(sign > 0) {
fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
}
fp += (vfp_o - vfp_e) / 4;
#endif
*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
//ADDI(&fp, 2, 1, 0);
MOVI(&fp, 1, 0);
// args: r0 - out
// r1 - N
// r2 - ws
// ADDI(&fp, 3, 1, 0); // put N into r3 for counter
int32_t pAddr = 0;
int32_t pN = 0;
int32_t pLUT = 0;
count = 2;
while(pps[0]) {
// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
if(!pN) {
MOVI(&fp, 1, pps[0]);
}else{
if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
}
if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
if(pps[0] == 2*leafN) {
*fp = BL(fp+2, x_4_addr); fp++;
}else if(!pps[2]){
//uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
}
fp += (neon_ee - neon_x8_t) / 4;
//*fp++ = BL(fp+2, x_8_t_addr);
#else
*fp = BL(fp+2, x_8_addr); fp++;
#endif
}else{
*fp = BL(fp+2, x_8_addr); fp++;
}
pAddr = pps[1] * 4;
pN = pps[0];
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
count += 4;
pps += 2;
}
*fp++ = 0xecbd8b10;
*fp++ = POP_LR(); count++;
#else
POP(&fp, R15);
POP(&fp, R14);
POP(&fp, R13);
POP(&fp, R12);
POP(&fp, R11);
POP(&fp, R10);
POP(&fp, RBX);
POP(&fp, RBP);
RET(&fp);
//uint8_t *pp = func;
//int counter = 0;
//do{
// printf("%02x ", *pp);
// if(counter++ % 16 == 15) printf("\n");
//} while(++pp < fp);
//printf("\n");
#endif
// *fp++ = B(14); count++;
//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
// fprintf(stderr, "%08x\n", x_4_addr[i]);
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++)
free(ps);
if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
perror("Couldn't mprotect");
exit(1);
}
#ifdef __APPLE__
sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
#ifdef __GNUC__
__clear_cache((long)(func), (long)(func) + p->transform_size);
#endif
#endif
//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);
p->transform = (void *) (start);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
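
ffts_generate_func_code() follows the classic JIT pattern: mmap writable pages, emit machine code into them, mprotect them executable, flush the instruction cache where the architecture needs it, and call through a function pointer. A self-contained x86-64 sketch of that pattern (my own example; it uses MAP_PRIVATE rather than the MAP_SHARED above, and the six code bytes are just mov eax, 42 then ret):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void) {
    size_t sz = 4096;
    uint8_t *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                        MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (buf == MAP_FAILED) return 1;
    /* x86-64: mov eax, 42 ; ret */
    static const uint8_t code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
    memcpy(buf, code, sizeof(code));
    if (mprotect(buf, sz, PROT_READ | PROT_EXEC)) return 1;
    /* object-to-function cast: the usual JIT idiom, as in p->transform above */
    int (*fn)(void) = (int (*)(void))buf;
    printf("%d\n", fn());   /* prints 42 */
    munmap(buf, sz);
    return 0;
}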

3rdparty/ffts/ffts-master/src/codegen.h vendored Normal file

@@ -0,0 +1,50 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CODEGEN_H__
#define __CODEGEN_H__
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/mman.h>
#include <string.h>
#include <limits.h> /* for PAGESIZE */
#include "ffts.h"
void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

3rdparty/ffts/ffts-master/src/codegen_arm.h vendored Normal file

@@ -0,0 +1,102 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CODEGEN_ARM_H__
#define __CODEGEN_ARM_H__
uint32_t BL(void *pos, void *target) {
return 0xeb000000 | (((target - pos) / 4) & 0xffffff);
}
uint32_t B(uint8_t r) {
return 0xe12fff10 | r;
}
uint32_t MOV(uint8_t dst, uint8_t src) {
return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12);
}
void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
int32_t oimm = imm;
if(imm < 0) {
imm = -imm;
uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
*(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
}else{
uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
*(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
}
uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
return 0xe5900000 | ((dst & 0xf) << 12)
| ((base & 0xf) << 16) | (offset & 0xfff) ;
}
void MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
uint32_t oimm = imm;
uint32_t shamt = (__builtin_ctzl(imm)>23)?23:__builtin_ctzl(imm);
if(shamt & 1) shamt -= 1;
imm >>= shamt;
shamt = (32 - shamt)/2;
*(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;
if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
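
ADDI() and MOVI() both lean on the ARM data-processing immediate format: an 8-bit value rotated right by twice a 4-bit rotate field. Rotating right by (32 - s) equals shifting left by s, which is why the shift amount is forced even and the field is written as (32 - shamt)/2. A standalone check of that identity (my own sketch):

#include <assert.h>
#include <stdint.h>

static uint32_t ror32(uint32_t v, unsigned r) {
    r &= 31;
    return r ? (v >> r) | (v << (32 - r)) : v;
}

int main(void) {
    uint32_t imm = 0x00002800;                    /* example immediate */
    uint32_t shamt = __builtin_ctz(imm) & ~1u;    /* even shift: 10 */
    uint32_t imm8 = imm >> shamt;                 /* 0x0a, fits in 8 bits */
    uint32_t rot = (32 - shamt) / 2;              /* rotate field: 11 */
    assert(imm8 <= 255);
    assert(ror32(imm8, 2 * rot) == imm);          /* reproduces the immediate */
    return 0;
}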
uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; }
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

3rdparty/ffts/ffts-master/src/codegen_sse.h vendored Normal file

@@ -0,0 +1,196 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CODEGEN_SSE_H__
#define __CODEGEN_SSE_H__
void neon_x4(float *, size_t, float *);
void neon_x8(float *, size_t, float *);
void neon_x8_t(float *, size_t, float *);
void leaf_ee_init();
void leaf_ee();
void leaf_oo();
void leaf_eo();
void leaf_oe();
void leaf_end();
void x_init();
void x4();
void x8_soft();
void x8_hard();
void sse_constants();
void sse_constants_inv();
// typedef uint8_t insns_t;
extern const uint32_t sse_leaf_ee_offsets[8];
extern const uint32_t sse_leaf_oo_offsets[8];
extern const uint32_t sse_leaf_eo_offsets[8];
extern const uint32_t sse_leaf_oe_offsets[8];
#define EAX 0
#define ECX 1
#define EDX 2
#define EBX 3
#define ESI 6
#define EDI 7
#define EBP 5
#define RAX 0
#define RCX 1
#define RDX 2
#define RBX 3
#define RSI 6
#define RDI 7
#define RBP 5
#define R8 8
#define R9 9
#define R10 10
#define R11 11
#define R12 12
#define R13 13
#define R14 14
#define R15 15
void IMM8(uint8_t **p, int32_t imm) {
*(*p)++ = (imm & 0xff);
}
void IMM16(uint8_t **p, int32_t imm) {
int i;
for(i=0;i<2;i++) {
*(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
}
}
void IMM32(uint8_t **p, int32_t imm) {
int i;
for(i=0;i<4;i++) {
*(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
}
}
void IMM32_NI(uint8_t *p, int32_t imm) {
int i;
for(i=0;i<4;i++) {
*(p+i) = (imm & (0xff << (i*8))) >> (i*8);
}
}
int32_t READ_IMM32(uint8_t *p) {
int32_t rval = 0;
int i;
for(i=0;i<4;i++) {
rval |= *(p+i) << (i*8);
}
return rval;
}
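
A round-trip check of the little-endian immediate helpers (my own sketch; the load accumulates in unsigned to keep the top-byte shift well-defined):

#include <assert.h>
#include <stdint.h>

static void IMM32(uint8_t **p, int32_t imm) {
    int i;
    for (i = 0; i < 4; i++)
        *(*p)++ = (imm >> (i * 8)) & 0xff;   /* least significant byte first */
}

static int32_t READ_IMM32(const uint8_t *p) {
    uint32_t rval = 0;
    int i;
    for (i = 0; i < 4; i++)
        rval |= (uint32_t)p[i] << (i * 8);
    return (int32_t)rval;
}

int main(void) {
    uint8_t buf[4], *p = buf;
    IMM32(&p, -123456);
    assert(READ_IMM32(buf) == -123456);
    return 0;
}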
void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
// if(imm < 65536) *(*p)++ = 0x66;
if(dst >= 8) *(*p)++ = 0x41;
//if(imm < 65536 && imm >= 256) *(*p)++ = 0x66;
//if(imm >= 256)
*(*p)++ = 0xb8 | (dst & 0x7);
// else *(*p)++ = 0xb0 | (dst & 0x7);
// if(imm < 256) IMM8(p, imm);
// else
//if(imm < 65536) IMM16(p, imm);
//else
IMM32(p, imm);
//if(dst < 8) {
// *(*p)++ = 0xb8 + dst;
//}else{
// *(*p)++ = 0x49;
// *(*p)++ = 0xc7;
// *(*p)++ = 0xc0 | (dst - 8);
//}
//IMM32(p, imm);
}
void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
if(disp == 0) {
*(*p)++ = (rm & 7) | ((reg & 7) << 3);
}else if(disp <= 127 && disp >= -128) {
*(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3);
IMM8(p, disp);
}else{
*(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3);
IMM32(p, disp);
}
}
void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) {
*(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1);
*(*p)++ = 0x8d;
ADDRMODE(p, dst, base, disp);
}
void RET(uint8_t **p) {
*(*p)++ = 0xc3;
}
void ADDI(uint8_t **p, uint8_t dst, int32_t imm) {
if(dst >= 8) *(*p)++ = 0x49;
else *(*p)++ = 0x48;
if(imm > 127 || imm <= -128) *(*p)++ = 0x81;
else *(*p)++ = 0x83;
*(*p)++ = 0xc0 | (dst & 0x7);
if(imm > 127 || imm <= -128) IMM32(p, imm);
else IMM8(p, imm);
}
void CALL(uint8_t **p, uint8_t *func) {
*(*p)++ = 0xe8;
IMM32(p, ((void *)func) - (void *)(*p) - 4);
}
void PUSH(uint8_t **p, uint8_t reg) {
if(reg >= 8) *(*p)++ = 0x41;
*(*p)++ = 0x50 | (reg & 7);
}
void POP(uint8_t **p, uint8_t reg) {
if(reg >= 8) *(*p)++ = 0x41;
*(*p)++ = 0x58 | (reg & 7);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
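
These emitters produce standard x86-64 encodings; for instance, pushing one of r8 through r15 needs a REX.B prefix, which is why PUSH() and POP() emit 0x41 first. A quick self-contained check (my own sketch, duplicating the two relevant emitters):

#include <assert.h>
#include <stdint.h>

#define R12 12

static void PUSH(uint8_t **p, uint8_t reg) {
    if (reg >= 8) *(*p)++ = 0x41;   /* REX.B prefix for r8-r15 */
    *(*p)++ = 0x50 | (reg & 7);
}

static void RET(uint8_t **p) {
    *(*p)++ = 0xc3;
}

int main(void) {
    uint8_t buf[4], *p = buf;
    PUSH(&p, R12);   /* expect 0x41 0x54 */
    RET(&p);         /* expect 0xc3 */
    assert(buf[0] == 0x41 && buf[1] == 0x54 && buf[2] == 0xc3);
    return 0;
}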

3rdparty/ffts/ffts-master/src/ffts.c vendored Normal file

@@ -0,0 +1,416 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts.h"
#include "macros.h"
//#include "mini_macros.h"
#include "patterns.h"
#include "ffts_small.h"
#ifdef DYNAMIC_DISABLED
#include "ffts_static.h"
#else
#include "codegen.h"
#endif
#include <errno.h>
#include <sys/mman.h>
#include <string.h>
#include <limits.h> /* for PAGESIZE */
#if __APPLE__
#include <libkern/OSCacheControl.h>
#else
#endif
void ffts_execute(ffts_plan_t *p, const void * in, void * out) {
//TODO: Define NEEDS_ALIGNED properly instead
#if defined(HAVE_SSE) || defined(HAVE_NEON)
if(((size_t)in % 16) != 0) {
LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n");
}
if(((size_t)out % 16) != 0) {
LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n");
}
#endif
p->transform(p, (const float *)in, (float *)out);
}
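
ffts_execute() is the hot path of the public API. A minimal caller looks like the sketch below (my own example; buffers are interleaved re/im floats and must be 16-byte aligned, as the checks above enforce on SSE/NEON builds):

#include "ffts.h"
#include <stdio.h>

int main(void) {
    /* 16-point forward complex FFT of a unit impulse at index 1. */
    float in[32]  __attribute__((aligned(32)));
    float out[32] __attribute__((aligned(32)));
    int i;
    for (i = 0; i < 16; i++) {
        in[2*i]     = (i == 1) ? 1.0f : 0.0f;  /* real part */
        in[2*i + 1] = 0.0f;                    /* imaginary part */
    }
    ffts_plan_t *p = ffts_init_1d(16, -1);     /* -1 = forward, as below */
    if (!p) return 1;
    ffts_execute(p, in, out);
    for (i = 0; i < 16; i++)
        printf("%f %+fi\n", out[2*i], out[2*i + 1]);
    ffts_free(p);
    return 0;
}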
void ffts_free(ffts_plan_t *p) {
p->destroy(p);
}
void ffts_free_1d(ffts_plan_t *p) {
size_t i;
if(p->ws) {
FFTS_FREE(p->ws);
}
if(p->is) free(p->is);
if(p->ws_is) free(p->ws_is);
if(p->offsets) free(p->offsets);
//free(p->transforms);
if(p->transforms) free(p->transforms);
if(p->transform_base) {
if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
perror("Couldn't mprotect");
exit(errno);
}
munmap(p->transform_base, p->transform_size);
//free(p->transform_base);
}
free(p);
}
ffts_plan_t *ffts_init_1d(size_t N, int sign) {
if(N == 0 || (N & (N - 1)) != 0){
LOG("FFT size must be a power of two\n");
return NULL;
}
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
size_t leafN = 8;
size_t i;
#ifdef __arm__
//#ifdef HAVE_NEON
V MULI_SIGN;
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
//#endif
#else
V MULI_SIGN;
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
#endif
p->transform = NULL;
p->transform_base = NULL;
p->transforms = NULL;
p->is = NULL;
p->ws_is = NULL;
p->ws = NULL;
p->offsets = NULL;
p->destroy = ffts_free_1d;
if(N >= 32) {
ffts_init_offsets(p, N, leafN);
ffts_init_is(p, N, leafN, 1);
p->i0 = N/leafN/3+1;
p->i1 = N/leafN/3;
if((N/leafN) % 3 > 1) p->i1++;
p->i2 = N/leafN/3;
#if !defined(__arm__) || defined(HAVE_NEON)
p->i0 /= 2;
p->i1 /= 2;
#endif
}else{
p->transforms = malloc(2 * sizeof(transform_index_t));
p->transforms[0] = 0;
p->transforms[1] = 1;
if(N == 2) p->transform = &firstpass_2;
else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
p->is = NULL;
p->offsets = NULL;
}
int hardcoded = 0;
/* LUTS */
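/* One twiddle LUT per recursion level: log2(N/leafN) levels in the
general case, log2(N/4) for the hardcoded sizes below 32. */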
size_t n_luts = __builtin_ctzl(N/leafN);
if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
if(n_luts >= 32) n_luts = 0;
// fprintf(stderr, "n_luts = %zu\n", n_luts);
cdata_t *w;
int n = leafN*2;
if(hardcoded) n = 8;
size_t lut_size = 0;
for(i=0;i<n_luts;i++) {
if(!i || hardcoded) {
#ifdef __arm__
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
else lut_size += n/4 * sizeof(cdata_t);
#else
lut_size += n/4 * 2 * sizeof(cdata_t);
#endif
n *= 2;
} else {
#ifdef __arm__
lut_size += n/8 * 3 * sizeof(cdata_t);
#else
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
#endif
}
n *= 2;
}
// lut_size *= 16;
// fprintf(stderr, "lut size = %zu\n", lut_size);
if(n_luts) {
p->ws = FFTS_MALLOC(lut_size,32);
p->ws_is = malloc(n_luts * sizeof(size_t));
}else{
p->ws = NULL;
p->ws_is = NULL;
}
w = p->ws;
n = leafN*2;
if(hardcoded) n = 8;
#ifdef HAVE_NEON
V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
#endif
for(i=0;i<n_luts;i++) {
p->ws_is[i] = w - (cdata_t *)p->ws;
//fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
if(!i || hardcoded) {
cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
size_t j;
for(j=0;j<n/4;j++) {
w0[j][0] = W_re(n,j);
w0[j][1] = W_im(n,j);
}
float *fw0 = (float *)w0;
#ifdef __arm__
if(N < 32) {
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
V temp0, temp1, temp2;
for(j=0;j<n/4;j+=2) {
// #ifdef HAVE_NEON
temp0 = VLD(fw0 + j*2);
V re, im;
re = VDUPRE(temp0);
im = VDUPIM(temp0);
#ifdef HAVE_NEON
im = VXOR(im, MULI_SIGN);
//im = IMULI(sign>0, im);
#else
im = MULI(sign>0, im);
#endif
VST(fw + j*4 , re);
VST(fw + j*4+4, im);
// #endif
}
w += n/4 * 2;
}else{
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
float *fw = (float *)w;
#ifdef HAVE_NEON
VS temp0, temp1, temp2;
for(j=0;j<n/4;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2, temp0);
}
#else
for(j=0;j<n/4;j+=1) {
fw[j*2] = fw0[j*2];
fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
}
#endif
w += n/4;
}
#else
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
V temp0, temp1, temp2;
for(j=0;j<n/4;j+=2) {
temp0 = VLD(fw0 + j*2);
V re, im;
re = VDUPRE(temp0);
im = VDUPIM(temp0);
im = VXOR(im, MULI_SIGN);
VST(fw + j*4 , re);
VST(fw + j*4+4, im);
}
w += n/4 * 2;
#endif
FFTS_FREE(w0);
}else{
cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
size_t j;
for(j=0;j<n/8;j++) {
w0[j][0] = W_re(n,j*2);
w0[j][1] = W_im(n,j*2);
w1[j][0] = W_re(n,j);
w1[j][1] = W_im(n,j);
w2[j][0] = W_re(n,j + (n/8));
w2[j][1] = W_im(n,j + (n/8));
}
float *fw0 = (float *)w0;
float *fw1 = (float *)w1;
float *fw2 = (float *)w2;
#ifdef __arm__
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
float *fw = (float *)w;
#ifdef HAVE_NEON
VS temp0, temp1, temp2;
for(j=0;j<n/8;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2*3, temp0);
temp1 = VLD2(fw1 + j*2);
temp1.val[1] = VXOR(temp1.val[1], neg);
STORESPR(fw + j*2*3 + 8, temp1);
temp2 = VLD2(fw2 + j*2);
temp2.val[1] = VXOR(temp2.val[1], neg);
STORESPR(fw + j*2*3 + 16, temp2);
}
#else
for(j=0;j<n/8;j+=1) {
fw[j*6] = fw0[j*2];
fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
fw[j*6+2] = fw1[j*2+0];
fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
fw[j*6+4] = fw2[j*2+0];
fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
}
#endif
w += n/8 * 3;
#else
//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
V temp0, temp1, temp2, re, im;
for(j=0;j<n/8;j+=2) {
temp0 = VLD(fw0 + j*2);
re = VDUPRE(temp0);
im = VDUPIM(temp0);
im = VXOR(im, MULI_SIGN);
VST(fw + j*2*6 , re);
VST(fw + j*2*6+4, im);
temp1 = VLD(fw1 + j*2);
re = VDUPRE(temp1);
im = VDUPIM(temp1);
im = VXOR(im, MULI_SIGN);
VST(fw + j*2*6+8 , re);
VST(fw + j*2*6+12, im);
temp2 = VLD(fw2 + j*2);
re = VDUPRE(temp2);
im = VDUPIM(temp2);
im = VXOR(im, MULI_SIGN);
VST(fw + j*2*6+16, re);
VST(fw + j*2*6+20, im);
}
w += n/8 * 3 * 2;
#endif
FFTS_FREE(w0);
FFTS_FREE(w1);
FFTS_FREE(w2);
}
///p->ws[i] = w;
n *= 2;
}
float *tmp = (float *)p->ws;
if(sign < 0) {
p->oe_ws = (void *)(&w_data[4]);
p->ee_ws = (void *)(w_data);
p->eo_ws = (void *)(&w_data[4]);
}else{
p->oe_ws = (void *)(w_data + 12);
p->ee_ws = (void *)(w_data + 8);
p->eo_ws = (void *)(w_data + 12);
}
p->N = N;
p->lastlut = w;
p->n_luts = n_luts;
#ifdef DYNAMIC_DISABLED
if(sign < 0) {
if(N >= 32) p->transform = ffts_static_transform_f;
}else{
if(N >= 32) p->transform = ffts_static_transform_i;
}
#else
if(N>=32) ffts_generate_func_code(p, N, leafN, sign);
#endif
return p;
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

186
3rdparty/ffts/ffts-master/src/ffts.h vendored Normal file
View File

@@ -0,0 +1,186 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CP_SSE_H__
#define __CP_SSE_H__
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
//#include <stdalign.h>
//#include "codegen.h"
#include "types.h"
#ifdef __ANDROID__
#include <android/log.h>
#define LOG(s) __android_log_print(ANDROID_LOG_ERROR, "FFTS", s)
#else
#define LOG(s) fprintf(stderr, s)
#endif
#define PI 3.1415926535897932384626433832795028841971693993751058209
static const __attribute__ ((aligned(64))) float w_data[16] = {
0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
-0.70710678118654757273731092936941, -0.70710678118654746171500846685376,
1.0f, 0.70710678118654757273731092936941f,
-0.0f, -0.70710678118654746171500846685376,
0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
1.0f, 0.70710678118654757273731092936941f,
0.0f, 0.70710678118654746171500846685376
};
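/* A reading of this table (not documented upstream): the first eight
floats feed the forward size-8 leaf passes via the ee_ws/oe_ws/eo_ws
pointers set in ffts_init_1d, the last eight feed the inverse ones;
0.7071... is cos(pi/4) = 1/sqrt(2). */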
__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); }
__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); }
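/* Twiddle factors: W_re/W_im are the real and imaginary parts of the
N-th root of unity W_N^k = exp(-2*pi*i*k/N). */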
typedef size_t transform_index_t;
//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
typedef struct _ffts_plan_t ffts_plan_t;
/**
* Contains all the information needed to perform the FFT
*
*
* DO NOT CHANGE THE ORDER OF MEMBERS
* ASSEMBLY CODE USES HARD CODED OFFSETS TO REFERENCE
* SOME OF THESE VARIABLES!!
*/
struct _ffts_plan_t {
/**
*
*/
ptrdiff_t *offsets;
#ifdef DYNAMIC_DISABLED
/**
* Twiddle factors
*/
void *ws;
/**
* ee - 2 size x size8
* oo - 2 x size4 in parallel
* oe -
*/
void *oe_ws, *eo_ws, *ee_ws;
#else
void __attribute__((aligned(32))) *ws;
void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
#endif
/**
* Pointer into an array of precomputed indexes for the input data array
*/
ptrdiff_t *is;
/**
* Twiddle Factor Indexes
*/
size_t *ws_is;
/**
* Size of the loops for the base cases
*/
size_t i0, i1, n_luts;
/**
* Size of the transform
*/
size_t N;
void *lastlut;
/**
* Used by the multidimensional code (?)
*/
transform_index_t *transforms;
//transform_func_t transform;
/**
* Pointer to the dynamically generated function
* that will execute the FFT
*/
void (*transform)(ffts_plan_t * , const void * , void * );
/**
* Pointer to the base memory address
* of the transform function
*/
void *transform_base;
/**
* Size of the memory block containing the
* generated code
*/
size_t transform_size;
/**
* Points to the constant variables used by
* the assembly code
*/
void *constants;
// multi-dimensional stuff:
struct _ffts_plan_t **plans;
int rank;
size_t *Ns, *Ms;
void *buf;
void *transpose_buf;
/**
* Pointer to the destroy function
* to clean up the plan after use
* (differs for real and multidimensional transforms)
*/
void (*destroy)(ffts_plan_t *);
/**
* Coefficients for the real-valued transforms
*/
float *A, *B;
size_t i2;
};
void ffts_free(ffts_plan_t *);
ffts_plan_t *ffts_init_1d(size_t N, int sign);
void ffts_execute(ffts_plan_t *, const void *, void *);
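/*
* Minimal usage sketch (illustrative only; error handling and aligned
* allocation are elided -- in and out must be 16-byte aligned on
* SSE/NEON builds and each hold N complex floats):
*
*   ffts_plan_t *p = ffts_init_1d(64, -1);   forward transform, N = 64
*   if (p) {
*       ffts_execute(p, in, out);
*       ffts_free(p);
*   }
*/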
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

314
3rdparty/ffts/ffts-master/src/ffts_nd.c vendored Normal file
View File

@@ -0,0 +1,314 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_nd.h"
#ifdef HAVE_NEON
#include "neon.h"
#endif
void ffts_free_nd(ffts_plan_t *p) {
int i;
for(i=0;i<p->rank;i++) {
ffts_plan_t *x = p->plans[i];
int k;
for(k=0;k<i;k++) {
if(p->Ms[i] == p->Ms[k]) x = NULL;
}
if(x) ffts_free(x);
}
free(p->Ns);
free(p->Ms);
free(p->plans);
free(p->buf);
free(p->transpose_buf);
free(p);
}
#define TSIZE 8
#include <string.h>
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
#ifdef HAVE_NEON
size_t i,j,k;
int linebytes = w*8;
for(j=0;j<h;j+=8) {
for(i=0;i<w;i+=8) {
neon_transpose_to_buf(in + j*w + i, buf, w);
uint64_t *p = out + i*h + j;
uint64_t *pbuf = buf;
uint64_t *ptemp;
#if defined(__aarch64__) || defined(__arm64__)
// This particular function comes out nicely using arm64 intrinsics; no need to deal with inline asm
{
uint64x2_t q8,q9,q10,q11,q12,q13,q14,q15;
int x;
for (x=0; x<4; x++)
{
ptemp = p;
p += w;
q8 = vld1q_u64(&pbuf[0]);
q9 = vld1q_u64(&pbuf[2]);
q10 = vld1q_u64(&pbuf[4]);
q11 = vld1q_u64(&pbuf[6]);
q12 = vld1q_u64(&pbuf[8]);
q13 = vld1q_u64(&pbuf[10]);
q14 = vld1q_u64(&pbuf[12]);
q15 = vld1q_u64(&pbuf[14]);
pbuf += 16;
vst1q_u64(&ptemp[0], q8);
vst1q_u64(&ptemp[2], q9);
vst1q_u64(&ptemp[4], q10);
vst1q_u64(&ptemp[6], q11);
ptemp = p;
p += w;
vst1q_u64(&ptemp[0], q12);
vst1q_u64(&ptemp[2], q13);
vst1q_u64(&ptemp[4], q14);
vst1q_u64(&ptemp[6], q15);
} // for x
} // aarch64
#else
__asm__ __volatile__(
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"add %[p], %[p], %[w], lsl #3\n\t"
"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
"mov %[ptemp], %[p]\n\t"
"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
: [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
: [w] "r" (w)
: "memory", "q8", "q9", "q10", "q11"
);
#endif // 32 vs 64-bit version
// out[i*h + j] = in[j*w + i];
}
}
#else
#ifdef HAVE_SSE
uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
int tx, ty;
int x, y;
int tw = w / TSIZE;
int th = h / TSIZE;
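/* 8x8 blocked transpose: each tile is transposed into the aligned tmp
buffer two source rows at a time with _mm_shuffle_pd, then written out
as contiguous rows so the stores to the destination stay sequential. */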
for (ty=0;ty<th;ty++) {
for (tx=0;tx<tw;tx++) {
uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE;
// Copy/transpose to tmp
for (y=0;y<TSIZE;y+=2) {
//for (x=0;x<TSIZE;x+=2) {
//op[x*TSIZE] = ip[x];
__m128d q0 = _mm_load_pd((double *)(ip0 + 0*w));
__m128d q1 = _mm_load_pd((double *)(ip0 + 1*w));
__m128d q2 = _mm_load_pd((double *)(ip0 + 2*w));
__m128d q3 = _mm_load_pd((double *)(ip0 + 3*w));
__m128d q4 = _mm_load_pd((double *)(ip0 + 4*w));
__m128d q5 = _mm_load_pd((double *)(ip0 + 5*w));
__m128d q6 = _mm_load_pd((double *)(ip0 + 6*w));
__m128d q7 = _mm_load_pd((double *)(ip0 + 7*w));
ip0 += 2;
__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
__m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
__m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
__m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
__m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
__m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
__m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
//_mm_store_pd((double *)(op0 + y*h + x), t0);
//_mm_store_pd((double *)(op0 + y*h + x + h), t1);
_mm_store_pd((double *)(op0 + 0), t0);
_mm_store_pd((double *)(op0 + 0 + TSIZE), t1);
_mm_store_pd((double *)(op0 + 2 ), t2);
_mm_store_pd((double *)(op0 + 2 + TSIZE), t3);
_mm_store_pd((double *)(op0 + 4 ), t4);
_mm_store_pd((double *)(op0 + 4 + TSIZE), t5);
_mm_store_pd((double *)(op0 + 6 ), t6);
_mm_store_pd((double *)(op0 + 6 + TSIZE), t7);
//}
op0 += 2*TSIZE;
}
op0 = out + h*tx*TSIZE + ty*TSIZE;
ip0 = tmp;
for (y=0;y<TSIZE;y+=1) {
// memcpy(op0, ip0, TSIZE * sizeof(*ip0));
__m128d q0 = _mm_load_pd((double *)(ip0 + 0));
__m128d q1 = _mm_load_pd((double *)(ip0 + 2));
__m128d q2 = _mm_load_pd((double *)(ip0 + 4));
__m128d q3 = _mm_load_pd((double *)(ip0 + 6));
_mm_store_pd((double *)(op0 + 0), q0);
_mm_store_pd((double *)(op0 + 2), q1);
_mm_store_pd((double *)(op0 + 4), q2);
_mm_store_pd((double *)(op0 + 6), q3);
op0 += h;
ip0 += TSIZE;
}
}
}
/*
size_t i,j;
for(i=0;i<w;i+=2) {
for(j=0;j<h;j+=2) {
// out[i*h + j] = in[j*w + i];
__m128d q0 = _mm_load_pd((double *)(in + j*w + i));
__m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
_mm_store_pd((double *)(out + i*h + j), t0);
_mm_store_pd((double *)(out + i*h + j + h), t1);
}
}
*/
#endif
#endif
}
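/* The N-D transform is computed as repeated 1-D passes: run the 1-D
plans along the rows of the current axis, then transpose so that the
next axis becomes contiguous in memory. */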
void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out) {
uint64_t *din = (uint64_t *)in;
uint64_t *buf = p->buf;
uint64_t *dout = (uint64_t *)out;
size_t i,j;
for(i=0;i<p->Ns[0];i++) {
p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0]));
}
ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);
for(i=1;i<p->rank;i++) {
for(j=0;j<p->Ns[i];j++) {
p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
}
ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
}
}
ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) {
size_t vol = 1;
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
p->transform = &ffts_execute_nd;
p->destroy = &ffts_free_nd;
p->rank = rank;
p->Ns = malloc(sizeof(size_t) * rank);
p->Ms = malloc(sizeof(size_t) * rank);
p->plans = malloc(sizeof(ffts_plan_t **) * rank);
int i;
for(i=0;i<rank;i++) {
p->Ns[i] = Ns[i];
vol *= Ns[i];
}
p->buf = valloc(sizeof(float) * 2 * vol);
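/* Reuse an existing 1-D plan whenever another axis needs the same
transform length; ffts_free_nd performs the matching test so each
shared plan is freed exactly once. */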
for(i=0;i<rank;i++) {
p->Ms[i] = vol / p->Ns[i];
p->plans[i] = NULL;
int k;
for(k=0;k<i;k++) {
if(p->Ms[k] == p->Ms[i])
p->plans[i] = p->plans[k];
}
if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
}
p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
return p;
}
ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) {
size_t Ns[2];
Ns[0] = N1;
Ns[1] = N2;
return ffts_init_nd(2, Ns, sign);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

59
3rdparty/ffts/ffts-master/src/ffts_nd.h vendored Normal file
View File

@@ -0,0 +1,59 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_ND_H__
#define __FFTS_ND_H__
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "ffts.h"
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
void ffts_free_nd(ffts_plan_t *p);
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf);
void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out);
ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

227
3rdparty/ffts/ffts-master/src/ffts_real.c vendored Normal file
View File

@@ -0,0 +1,227 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real.h"
void ffts_free_1d_real(ffts_plan_t *p) {
ffts_free(p->plans[0]);
free(p->A);
free(p->B);
free(p->plans);
free(p->buf);
free(p);
}
void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) {
float *out = (float *)vout;
float *buf = (float *)p->buf;
float *A = p->A;
float *B = p->B;
p->plans[0]->transform(p->plans[0], vin, buf);
size_t N = p->N;
buf[N] = buf[0];
buf[N+1] = buf[1];
float *p_buf0 = buf;
float *p_buf1 = buf + N - 2;
float *p_out = out;
size_t i;
#ifdef __ARM_NEON__
for(i=0;i<N/2;i+=2) {
__asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
"vld1.32 {q9}, [%[pb], :128]!\n\t"
"vld1.32 {q10}, [%[buf0], :128]!\n\t"
"vld1.32 {q11}, [%[buf1], :64]\n\t"
"sub %[buf1], %[buf1], #16\n\t"
"vdup.32 d26, d16[1]\n\t"
"vdup.32 d27, d17[1]\n\t"
"vdup.32 d24, d16[0]\n\t"
"vdup.32 d25, d17[0]\n\t"
"vdup.32 d30, d23[1]\n\t"
"vdup.32 d31, d22[1]\n\t"
"vdup.32 d28, d23[0]\n\t"
"vdup.32 d29, d22[0]\n\t"
"vmul.f32 q13, q13, q10\n\t"
"vmul.f32 q15, q15, q9\n\t"
"vmul.f32 q12, q12, q10\n\t"
"vmul.f32 q14, q14, q9\n\t"
"vrev64.f32 q13, q13\n\t"
"vrev64.f32 q15, q15\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d30, d31\n\t"
"vneg.f32 d26, d26\n\t"
"vneg.f32 d31, d31\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d30, d31\n\t"
"vadd.f32 q12, q12, q14\n\t"
"vadd.f32 q13, q13, q15\n\t"
"vadd.f32 q12, q12, q13\n\t"
"vst1.32 {q12}, [%[pout], :128]!\n\t"
: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
[pout] "+r" (p_out)
:
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
#else
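/* Scalar path: combine bin k of the half-size complex FFT with the
reflected bin N/2-k using the A/B twiddles built in ffts_init_1d_real,
recovering the spectrum of the length-N real input. */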
for(i=0;i<N/2;i++) {
out[2*i] = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1];
out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i];
// out[2*N-2*i] = out[2*i];
// out[2*N-2*i+1] = -out[2*i+1];
#endif
}
out[N] = buf[0] - buf[1];
out[N+1] = 0.0f;
}
void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
float *out = (float *)vout;
float *in = (float *)vin;
float *buf = (float *)p->buf;
float *A = p->A;
float *B = p->B;
size_t N = p->N;
float *p_buf0 = in;
float *p_buf1 = in + N - 2;
float *p_out = buf;
size_t i;
#ifdef __ARM_NEON__
for(i=0;i<N/2;i+=2) {
__asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
"vld1.32 {q9}, [%[pb], :128]!\n\t"
"vld1.32 {q10}, [%[buf0], :128]!\n\t"
"vld1.32 {q11}, [%[buf1], :64]\n\t"
"sub %[buf1], %[buf1], #16\n\t"
"vdup.32 d26, d16[1]\n\t"
"vdup.32 d27, d17[1]\n\t"
"vdup.32 d24, d16[0]\n\t"
"vdup.32 d25, d17[0]\n\t"
"vdup.32 d30, d23[1]\n\t"
"vdup.32 d31, d22[1]\n\t"
"vdup.32 d28, d23[0]\n\t"
"vdup.32 d29, d22[0]\n\t"
"vmul.f32 q13, q13, q10\n\t"
"vmul.f32 q15, q15, q9\n\t"
"vmul.f32 q12, q12, q10\n\t"
"vmul.f32 q14, q14, q9\n\t"
"vrev64.f32 q13, q13\n\t"
"vrev64.f32 q15, q15\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d28, d29\n\t"
"vneg.f32 d27, d27\n\t"
"vneg.f32 d29, d29\n\t"
"vtrn.32 d26, d27\n\t"
"vtrn.32 d28, d29\n\t"
"vadd.f32 q12, q12, q14\n\t"
"vsub.f32 q13, q13, q15\n\t"
"vadd.f32 q12, q12, q13\n\t"
"vst1.32 {q12}, [%[pout], :128]!\n\t"
: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
[pout] "+r" (p_out)
:
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
#else
for(i=0;i<N/2;i++) {
buf[2*i] = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1];
buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i];
#endif
}
p->plans[0]->transform(p->plans[0], buf, out);
}
ffts_plan_t *ffts_init_1d_real(size_t N, int sign) {
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
if(sign < 0) p->transform = &ffts_execute_1d_real;
else p->transform = &ffts_execute_1d_real_inv;
p->destroy = &ffts_free_1d_real;
p->N = N;
p->rank = 1;
p->plans = malloc(sizeof(ffts_plan_t **) * 1);
p->plans[0] = ffts_init_1d(N/2, sign);
p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1));
p->A = valloc(sizeof(float) * N);
p->B = valloc(sizeof(float) * N);
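/* Assumed derivation (the standard length-N real FFT via a length-N/2
complex FFT; not documented upstream): with theta = 2*pi*k/N,
A_k = (1 - i*e^{-i*theta})/2 = ((1 - sin theta)/2, -(cos theta)/2)
B_k = (1 + i*e^{-i*theta})/2 = ((1 + sin theta)/2,  (cos theta)/2)
stored interleaved as (re, im) pairs; the inverse tables below drop
the factor of 1/2. */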
if(sign < 0) {
int i;
for (i = 0; i < N/2; i++) {
p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
}
}else{
int i;
for (i = 0; i < N/2; i++) {
p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
}
}
return p;
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

54
3rdparty/ffts/ffts-master/src/ffts_real.h vendored Normal file
View File

@@ -0,0 +1,54 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_REAL_H__
#define __FFTS_REAL_H__
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "ffts.h"
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

197
3rdparty/ffts/ffts-master/src/ffts_real_nd.c vendored Normal file
View File

@@ -0,0 +1,197 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real_nd.h"
#ifdef __ARM_NEON__
#include "neon.h"
#endif
void ffts_free_nd_real(ffts_plan_t *p) {
int i;
for(i=0;i<p->rank;i++) {
ffts_plan_t *x = p->plans[i];
int k;
for(k=i+1;k<p->rank;k++) {
if(x == p->plans[k]) p->plans[k] = NULL;
}
if(x) ffts_free(x);
}
free(p->Ns);
free(p->Ms);
free(p->plans);
free(p->buf);
free(p->transpose_buf);
free(p);
}
void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
size_t i,j;
for(i=0;i<w;i+=1) {
for(j=0;j<h;j+=1) {
out[i*h + j] = in[j*w + i];
}
}
}
void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) {
uint32_t *din = (uint32_t *)in;
uint64_t *buf = p->buf;
uint64_t *dout = (uint64_t *)out;
size_t i,j;
for(i=0;i<p->Ns[0];i++) {
p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));
}
ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);
for(i=1;i<p->rank;i++) {
for(j=0;j<p->Ns[i];j++) {
p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));
}
ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);
}
}
void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) {
uint64_t *din = (uint64_t *)in;
uint64_t *buf = p->buf;
uint64_t *buf2;
uint64_t *dout = (uint64_t *)out;
size_t vol = 1;
float *bufr = (float *)(p->buf);
float *doutr = (float *)out;
size_t i,j;
for(i=0;i<p->rank;i++) {
vol *= p->Ns[i];
}
buf2 = buf + vol;
ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);
for(i=0;i<p->Ms[0];i++) {
p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0]));
}
ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf);
for(j=0;j<p->Ms[1];j++) {
p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);
}
}
ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
size_t vol = 1;
size_t bufsize;
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
if(sign < 0) p->transform = &ffts_execute_nd_real;
else p->transform = &ffts_execute_nd_real_inv;
p->destroy = &ffts_free_nd_real;
p->rank = rank;
p->Ns = malloc(sizeof(size_t) * rank);
p->Ms = malloc(sizeof(size_t) * rank);
p->plans = malloc(sizeof(ffts_plan_t **) * rank);
int i;
for(i=0;i<rank;i++) {
p->Ns[i] = Ns[i];
vol *= Ns[i];
}
//There is probably a prettier way of doing this, but it works..
if(sign < 0) {
bufsize = 2 * vol;
}
else {
bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
}
p->buf = valloc(sizeof(float) * bufsize);
for(i=0;i<rank;i++) {
p->Ms[i] = vol / p->Ns[i];
p->plans[i] = NULL;
int k;
if(sign < 0) {
for(k=1;k<i;k++) {
if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
}
if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
}else{
for(k=0;k<i;k++) {
if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
}
if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign);
}
}
if(sign < 0) {
for(i=1;i<rank;i++) {
p->Ns[i] = p->Ns[i] / 2 + 1;
}
}else{
for(i=0;i<rank-1;i++) {
p->Ms[i] = p->Ms[i] / 2 + 1;
}
}
p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
return p;
}
ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
size_t Ns[2];
Ns[0] = N1;
Ns[1] = N2;
return ffts_init_nd_real(2, Ns, sign);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

54
3rdparty/ffts/ffts-master/src/ffts_real_nd.h vendored Normal file
View File

@@ -0,0 +1,54 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_REAL_ND_H__
#define __FFTS_REAL_ND_H__
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "ffts_nd.h"
#include "ffts_real.h"
#include "ffts.h"
#ifdef HAVE_NEON
#include <arm_neon.h>
#endif
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

157
3rdparty/ffts/ffts-master/src/ffts_small.c vendored Normal file
View File

@@ -0,0 +1,157 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts.h"
#include "macros.h"
#include <stdlib.h>
#define DEBUG(x)
#include "ffts_small.h"
void firstpass_16_f(ffts_plan_t * p, const void * in, void * out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
float *LUT8 = p->ws;
L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
}
void firstpass_16_b(ffts_plan_t * p, const void * in, void * out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
float *LUT8 = p->ws;
L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
}
void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1, r2_3, r4_5, r6_7;
float *LUT8 = p->ws + p->ws_is[0];
L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
}
void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
V r0_1, r2_3, r4_5, r6_7;
float *LUT8 = p->ws + p->ws_is[0];
L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
}
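/* Scalar 4-point base cases: two 2-point butterflies (t4..t7) combined
with a +/-i rotation of t7; the forward (_f) and backward (_b)
variants differ only in the sign of that rotation. */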
void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
t0[0] = din[0]; t0[1] = din[1];
t1[0] = din[4]; t1[1] = din[5];
t2[0] = din[2]; t2[1] = din[3];
t3[0] = din[6]; t3[1] = din[7];
t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
}
void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
t0[0] = din[0]; t0[1] = din[1];
t1[0] = din[4]; t1[1] = din[5];
t2[0] = din[2]; t2[1] = din[3];
t3[0] = din[6]; t3[1] = din[7];
t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
}
void firstpass_2(ffts_plan_t *p, const void *in, void *out)
{
const data_t *din = (const data_t *)in;
data_t *dout = (data_t *)out;
cdata_t t0, t1, r0,r1;
t0[0] = din[0]; t0[1] = din[1];
t1[0] = din[2]; t1[1] = din[3];
r0[0] = t0[0] + t1[0];
r0[1] = t0[1] + t1[1];
r1[0] = t0[0] - t1[0];
r1[1] = t0[1] - t1[1];
dout[0] = r0[0]; dout[1] = r0[1];
dout[2] = r1[0]; dout[3] = r1[1];
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

14
3rdparty/ffts/ffts-master/src/ffts_small.h vendored Normal file
View File

@@ -0,0 +1,14 @@
#ifndef __FFTS_SMALL_H__
#define __FFTS_SMALL_H__
void firstpass_16_f(ffts_plan_t * p, const void * in, void * out);
void firstpass_16_b(ffts_plan_t * p, const void * in, void * out);
void firstpass_8_f(ffts_plan_t * p, const void * in, void * out);
void firstpass_8_b(ffts_plan_t * p, const void * in, void * out);
void firstpass_4_f(ffts_plan_t * p, const void * in, void * out);
void firstpass_4_b(ffts_plan_t * p, const void * in, void * out);
void firstpass_2(ffts_plan_t * p, const void * in, void * out);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

102
3rdparty/ffts/ffts-master/src/ffts_static.c vendored Normal file
View File

@@ -0,0 +1,102 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_static.h"
void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) {
if(N > 16) {
size_t N1 = N >> 1;
size_t N2 = N >> 2;
size_t N3 = N >> 3;
float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1);
ffts_static_rec_i(p, data, N2);
ffts_static_rec_i(p, data + N1, N3);
ffts_static_rec_i(p, data + N1 + N2, N3);
ffts_static_rec_i(p, data + N, N2);
ffts_static_rec_i(p, data + N + N1, N2);
if(N == p->N) {
neon_static_x8_t_i(data, N, ws);
}else{
neon_static_x8_i(data, N, ws);
}
}else if(N==16){
neon_static_x4_i(data, N, p->ws);
}
}
void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) {
if(N > 16) {
size_t N1 = N >> 1;
size_t N2 = N >> 2;
size_t N3 = N >> 3;
float *ws = ((float *)(p->ws)) + (p->ws_is[__builtin_ctzl(N)-4] << 1);
ffts_static_rec_f(p, data, N2);
ffts_static_rec_f(p, data + N1, N3);
ffts_static_rec_f(p, data + N1 + N2, N3);
ffts_static_rec_f(p, data + N, N2);
ffts_static_rec_f(p, data + N + N1, N2);
if(N == p->N) {
neon_static_x8_t_f(data, N, ws);
}else{
neon_static_x8_f(data, N, ws);
}
}else if(N==16){
neon_static_x4_f(data, N, p->ws);
}
}
void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out) {
if(__builtin_ctzl(p->N) & 1)
neon_static_o_f(p, in, out);
else
neon_static_e_f(p, in, out);
ffts_static_rec_f(p, out, p->N);
}
void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out) {
if(__builtin_ctzl(p->N) & 1)
neon_static_o_i(p, in, out);
else
neon_static_e_i(p, in, out);
ffts_static_rec_i(p, out, p->N);
}
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

47
3rdparty/ffts/ffts-master/src/ffts_static.h vendored Normal file
View File

@@ -0,0 +1,47 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __FFTS_STATIC_H__
#define __FFTS_STATIC_H__
#include "ffts.h"
#include "neon.h"
void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ;
void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out);
void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ;
void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

207
3rdparty/ffts/ffts-master/src/macros-alpha.h vendored Normal file
View File

@@ -0,0 +1,207 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_ALPHA_H__
#define __MACROS_ALPHA_H__
#include <math.h>
#include <stdint.h>   /* uint32_t (used below) */
#include <string.h>   /* memcpy, for bit-pattern XOR */
#ifdef __alpha__
#define restrict
#endif
typedef struct {float r1, i1, r2, i2;} V;
#define FFTS_MALLOC(d,a) malloc(d)
#define FFTS_FREE(d) free(d)
#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3})
static inline V VADD(V x, V y)
{
V z;
z.r1 = x.r1 + y.r1;
z.i1 = x.i1 + y.i1;
z.r2 = x.r2 + y.r2;
z.i2 = x.i2 + y.i2;
return z;
}
static inline V VSUB(V x, V y)
{
V z;
z.r1 = x.r1 - y.r1;
z.i1 = x.i1 - y.i1;
z.r2 = x.r2 - y.r2;
z.i2 = x.i2 - y.i2;
return z;
}
static inline V VMUL(V x, V y)
{
V z;
z.r1 = x.r1 * y.r1;
z.i1 = x.i1 * y.i1;
z.r2 = x.r2 * y.r2;
z.i2 = x.i2 * y.i2;
return z;
}
/* XOR the float bit patterns via memcpy type-punning; the original value
casts ((uint32_t)x.r1) converted numerically, so -0.0f became 0 and the
sign-flip trick used by the twiddle setup broke. */
static inline float VXOR1(float a, float b)
{
uint32_t ua, ub;
memcpy(&ua, &a, sizeof ua); memcpy(&ub, &b, sizeof ub);
ua ^= ub; memcpy(&a, &ua, sizeof a);
return a;
}
static inline V VXOR(V x, V y)
{
V r;
r.r1 = VXOR1(x.r1, y.r1); r.i1 = VXOR1(x.i1, y.i1);
r.r2 = VXOR1(x.r2, y.r2); r.i2 = VXOR1(x.i2, y.i2);
return r;
}
static inline V VSWAPPAIRS(V x)
{
V z;
z.r1 = x.i1;
z.i1 = x.r1;
z.r2 = x.i2;
z.i2 = x.r2;
return z;
}
static inline V VBLEND(V x, V y)
{
V z;
z.r1 = x.r1;
z.i1 = x.i1;
z.r2 = y.r2;
z.i2 = y.i2;
return z;
}
static inline V VUNPACKHI(V x, V y)
{
V z;
z.r1 = x.r2;
z.i1 = x.i2;
z.r2 = y.r2;
z.i2 = y.i2;
return z;
}
static inline V VUNPACKLO(V x, V y)
{
V z;
z.r1 = x.r1;
z.i1 = x.i1;
z.r2 = y.r1;
z.i2 = y.i1;
return z;
}
static inline V VDUPRE(V x)
{
V z;
z.r1 = x.r1;
z.i1 = x.r1;
z.r2 = x.r2;
z.i2 = x.r2;
return z;
}
static inline V VDUPIM(V x)
{
V z;
z.r1 = x.i1;
z.i1 = x.i1;
z.r2 = x.i2;
z.i2 = x.i2;
return z;
}
static inline V IMUL(V d, V re, V im)
{
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VSUB(re, im);
}
static inline V IMULJ(V d, V re, V im)
{
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VADD(re, im);
}
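/* IMUL/IMULJ implement the complex multiply of the data vector d by a
twiddle whose duplicated real (re) and imaginary (im) parts come from
the LUT; the sign handling of the imaginary lane is baked into the LUT
when the plan is built (see ffts_init_1d), so no conjugate appears here. */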
static inline V MULI(int inv, V x)
{
V z;
if (inv) {
z.r1 = -x.r1;
z.i1 = x.i1;
z.r2 = -x.r2;
z.i2 = x.i2;
}else{
z.r1 = x.r1;
z.i1 = -x.i1;
z.r2 = x.r2;
z.i2 = -x.i2;
}
return z;
}
static inline V IMULI(int inv, V x)
{
return VSWAPPAIRS(MULI(inv, x));
}
static inline V VLD(const void *s)
{
V *d = (V *)s;
return *d;
}
static inline void VST(void *d, V s)
{
V *r = (V *)d;
*r = s;
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

138
3rdparty/ffts/ffts-master/src/macros-altivec.h vendored Normal file
View File

@@ -0,0 +1,138 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_ALTIVEC_H__
#define __MACROS_ALTIVEC_H__
#include <math.h>
#include <altivec.h>
#define restrict
typedef vector float V;
typedef vector unsigned char VUC;
#ifdef __APPLE__
#define FFTS_MALLOC(d,a) vec_malloc(d)
#define FFTS_FREE(d) vec_free(d)
#else
/* It appears vec_malloc() and friends are not implemented on Linux */
#include <malloc.h>
#define FFTS_MALLOC(d,a) memalign(16,d)
#define FFTS_FREE(d) free(d)
#endif
#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
#define VADD(x,y) vec_add(x,y)
#define VSUB(x,y) vec_sub(x,y)
#define VMUL(x,y) vec_madd(x,y,(V){0})
#define VMULADD(x,y,z) vec_madd(x,y,z)
#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
#define VXOR(x,y) vec_xor((x),(y))
#define VSWAPPAIRS(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
#define VBLEND(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
#define VUNPACKHI(x,y) \
vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
#define VUNPACKLO(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
#define VDUPRE(x) \
vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
#define VDUPIM(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
static inline V IMUL(V d, V re, V im)
{
im = VMUL(im, VSWAPPAIRS(d));
re = VMUL(re, d);
return VSUB(re, im);
}
static inline V IMULJ(V d, V re, V im)
{
im = VMUL(im, VSWAPPAIRS(d));
return VMULADD(re, d, im);
}
#ifndef __GNUC__
/* gcc (4.6 and 4.7) ICEs on this code! */
static inline V MULI(int inv, V x)
{
return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
}
#else
/* but compiles this fine... */
static inline V MULI(int inv, V x)
{
V t;
t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
return VXOR(x, t);
}
#endif
static inline V IMULI(int inv, V x)
{
return VSWAPPAIRS(MULI(inv, x));
}
static inline V VLD(const void *s)
{
V *d = (V *)s;
return *d;
}
static inline void VST(void *d, V s)
{
V *r = (V *)d;
*r = s;
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

97
3rdparty/ffts/ffts-master/src/macros-neon.h vendored Normal file
View File

@@ -0,0 +1,97 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_NEON_H__
#define __MACROS_NEON_H__
#include "neon.h"
#include <arm_neon.h>
typedef float32x4_t V;
typedef float32x4x2_t VS;
#define ADD vaddq_f32
#define SUB vsubq_f32
#define MUL vmulq_f32
#define VADD vaddq_f32
#define VSUB vsubq_f32
#define VMUL vmulq_f32
#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
#define VST vst1q_f32
#define VLD vld1q_f32
#define VST2 vst2q_f32
#define VLD2 vld2q_f32
#define VSWAPPAIRS(x) (vrev64q_f32(x))
#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
return VLD(d);
}
#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
#define FFTS_MALLOC(d,a) (valloc(d))
#define FFTS_FREE(d) (free(d))
__INLINE void STORESPR(data_t * addr, VS p) {
vst1q_f32(addr, p.val[0]);
vst1q_f32(addr + 4, p.val[1]);
}
__INLINE V IMULI(int inv, V a) {
if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
}
__INLINE V IMUL(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VSUB(re, im);
}
__INLINE V IMULJ(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VADD(re, im);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
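
IMULI above multiplies each complex pair by +/-i with no floating-point multiply at all: XOR against a -0.0f lane flips a float's sign bit, and VSWAPPAIRS then exchanges the real/imaginary slots. Note also that VLIT4 on this path takes its arguments high-lane-first, mirroring _mm_set_ps. A portable C sketch of the same bit trick (sign_flip and muli are illustrative names; which of +i/-i the inv flag selects is my reading of the masks):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Flip a float's sign by XORing its sign bit -- the scalar
   equivalent of VXOR against a -0.0f lane. */
static float sign_flip(float x)
{
    uint32_t u;
    memcpy(&u, &x, sizeof u);   /* well-defined type pun */
    u ^= 0x80000000u;           /* bit pattern of -0.0f  */
    memcpy(&x, &u, sizeof u);
    return x;
}

/* Multiply the pair {re, im} by +i or -i: negate one component,
   then swap the pair (the VSWAPPAIRS step). */
static void muli(int inv, float *re, float *im)
{
    float r = *re, i = *im;
    if (inv) { *re = i;            *im = sign_flip(r); }  /* (a+bi)*-i = b - ai  */
    else     { *re = sign_flip(i); *im = r;            }  /* (a+bi)*+i = -b + ai */
}

int main(void)
{
    float re = 3.0f, im = 4.0f;
    muli(0, &re, &im);
    printf("%g%+gi\n", re, im);   /* prints -4+3i */
    return 0;
}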


@@ -0,0 +1,85 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SSE_FLOAT_H__
#define __SSE_FLOAT_H__
#include <xmmintrin.h>
//#define VL 4
typedef __m128 V;
#define VADD _mm_add_ps
#define VSUB _mm_sub_ps
#define VMUL _mm_mul_ps
//#define VLIT4 _mm_set_ps
#define VXOR _mm_xor_ps
#define VST _mm_store_ps
#define VLD _mm_load_ps
#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
#define VLIT4 _mm_set_ps
#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
#define FFTS_FREE(d) (_mm_free(d))
__INLINE V IMULI(int inv, V a) {
if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
}
__INLINE V IMUL(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VSUB(re, im);
}
__INLINE V IMULJ(V d, V re, V im) {
re = VMUL(re, d);
im = VMUL(im, VSWAPPAIRS(d));
return VADD(re, im);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
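
VLD/VST on the SSE path map to the aligned _mm_load_ps/_mm_store_ps, so every buffer that reaches them must be 16-byte aligned; that is why FFTS_MALLOC routes through _mm_malloc rather than plain malloc. A minimal sketch of that contract (the buffer size and contents are illustrative):

#include <xmmintrin.h>
#include <stdio.h>

int main(void)
{
    /* 16-byte alignment is required by _mm_load_ps/_mm_store_ps;
       an unaligned pointer would fault at the first VLD. */
    float *buf = (float *)_mm_malloc(8 * sizeof(float), 16);
    if (!buf) return 1;

    for (int i = 0; i < 8; i++) buf[i] = (float)i;

    __m128 v = _mm_load_ps(buf);        /* VLD  */
    v = _mm_add_ps(v, v);               /* VADD */
    _mm_store_ps(buf + 4, v);           /* VST  */

    printf("%g %g\n", buf[4], buf[7]);  /* 0 6 */
    _mm_free(buf);
    return 0;
}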

3rdparty/ffts/ffts-master/src/macros.h vendored Normal file

@@ -0,0 +1,162 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MACROS_H__
#define __MACROS_H__
#ifdef HAVE_NEON
#include "macros-neon.h"
#else
#ifdef __alpha__
#include "macros-alpha.h"
#else
#ifdef __powerpc__
#include "macros-altivec.h"
#endif
#endif
#endif
#ifdef HAVE_VFP
#include "macros-alpha.h"
#endif
#ifdef HAVE_SSE
#include "macros-sse.h"
#endif
static inline void TX2(V *a, V *b)
{
V TX2_t0 = VUNPACKLO(*a, *b);
V TX2_t1 = VUNPACKHI(*a, *b);
*a = TX2_t0; *b = TX2_t1;
}
static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
{
V uk, uk2, zk_p, zk_n, zk, zk_d;
uk = *r0; uk2 = *r1;
zk_p = IMUL(*r2, re, im);
zk_n = IMULJ(*r3, re, im);
zk = VADD(zk_p, zk_n);
zk_d = IMULI(inv, VSUB(zk_p, zk_n));
*r2 = VSUB(uk, zk);
*r0 = VADD(uk, zk);
*r3 = VADD(uk2, zk_d);
*r1 = VSUB(uk2, zk_d);
}
static inline void S_4(V r0, V r1, V r2, V r3,
data_t * restrict o0, data_t * restrict o1,
data_t * restrict o2, data_t * restrict o3)
{
VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
}
static inline void L_2_4(int inv,
const data_t * restrict i0, const data_t * restrict i1,
const data_t * restrict i2, const data_t * restrict i3,
V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = VSUB(t2, t3);
*r0 = VUNPACKLO(t4, t5);
*r1 = VUNPACKLO(t6, t7);
t5 = IMULI(inv, t5);
t0 = VADD(t6, t4);
t2 = VSUB(t6, t4);
t1 = VSUB(t7, t5);
t3 = VADD(t7, t5);
*r3 = VUNPACKHI(t0, t1);
*r2 = VUNPACKHI(t2, t3);
}
static inline void L_4_4(int inv,
const data_t * restrict i0, const data_t * restrict i1,
const data_t * restrict i2, const data_t * restrict i3,
V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = IMULI(inv, VSUB(t2, t3));
t0 = VADD(t4, t6);
t2 = VSUB(t4, t6);
t1 = VSUB(t5, t7);
t3 = VADD(t5, t7);
TX2(&t0, &t1);
TX2(&t2, &t3);
*r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
}
static inline void L_4_2(int inv,
const data_t * restrict i0, const data_t * restrict i1,
const data_t * restrict i2, const data_t * restrict i3,
V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3);
t2 = VBLEND(t6, t7);
t3 = VBLEND(t7, t6);
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = VSUB(t2, t3);
*r2 = VUNPACKHI(t4, t5);
*r3 = VUNPACKHI(t6, t7);
t7 = IMULI(inv, t7);
t0 = VADD(t4, t6);
t2 = VSUB(t4, t6);
t1 = VSUB(t5, t7);
t3 = VADD(t5, t7);
*r0 = VUNPACKLO(t0, t1);
*r1 = VUNPACKLO(t2, t3);
}
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
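
K_N above is the radix-4 butterfly shared by every backend. Reading IMUL as multiplication by the twiddle w and IMULJ as multiplication by its conjugate (per the note after macros-altivec.h), one invocation computes, as my reading of the dataflow:

\begin{aligned}
z_p &= w\,r_2, & z_n &= \bar{w}\,r_3,\\
z   &= z_p + z_n, & z_d &= \pm i\,(z_p - z_n),\\
r_0' &= r_0 + z, & r_2' &= r_0 - z,\\
r_1' &= r_1 - z_d, & r_3' &= r_1 + z_d,
\end{aligned}

with the sign of i chosen by the inv flag through IMULI.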

3rdparty/ffts/ffts-master/src/neon.h vendored Normal file

@@ -0,0 +1,66 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NEON_H__
#define __NEON_H__
#include "ffts.h"
void neon_x4(float *, size_t, float *);
void neon_x8(float *, size_t, float *);
void neon_x8_t(float *, size_t, float *);
void neon_ee();
void neon_oo();
void neon_eo();
void neon_oe();
void neon_end();
void neon_transpose(uint64_t *in, uint64_t *out, int w, int h);
void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w);
//typedef struct _ffts_plan_t ffts_plan_t;
void neon_static_e_f(ffts_plan_t * , const void * , void * );
void neon_static_o_f(ffts_plan_t * , const void * , void * );
void neon_static_x4_f(float *, size_t, float *);
void neon_static_x8_f(float *, size_t, float *);
void neon_static_x8_t_f(float *, size_t, float *);
void neon_static_e_i(ffts_plan_t * , const void * , void * );
void neon_static_o_i(ffts_plan_t * , const void * , void * );
void neon_static_x4_i(float *, size_t, float *);
void neon_static_x8_i(float *, size_t, float *);
void neon_static_x8_t_i(float *, size_t, float *);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

3rdparty/ffts/ffts-master/src/neon.s vendored Normal file

@@ -0,0 +1,738 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.align 4
#ifdef __APPLE__
.globl _neon_x4
_neon_x4:
#else
.globl neon_x4
neon_x4:
#endif
@ add r3, r0, #0
vld1.32 {q8,q9}, [r0, :128]
add r4, r0, r1, lsl #1
vld1.32 {q10,q11}, [r4, :128]
add r5, r0, r1, lsl #2
vld1.32 {q12,q13}, [r5, :128]
add r6, r4, r1, lsl #2
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q2,q3}, [r2, :128]
vmul.f32 q0, q13, q3
vmul.f32 q5, q12, q2
vmul.f32 q1, q14, q2
vmul.f32 q4, q14, q3
vmul.f32 q14, q12, q3
vmul.f32 q13, q13, q2
vmul.f32 q12, q15, q3
vmul.f32 q2, q15, q2
vsub.f32 q0, q5, q0
vadd.f32 q13, q13, q14
vadd.f32 q12, q12, q1
vsub.f32 q1, q2, q4
vadd.f32 q15, q0, q12
vsub.f32 q12, q0, q12
vadd.f32 q14, q13, q1
vsub.f32 q13, q13, q1
vadd.f32 q0, q8, q15
vadd.f32 q1, q9, q14
vsub.f32 q2, q10, q13 @
vsub.f32 q4, q8, q15
vadd.f32 q3, q11, q12 @
vst1.32 {q0,q1}, [r0, :128]
vsub.f32 q5, q9, q14
vadd.f32 q6, q10, q13 @
vsub.f32 q7, q11, q12 @
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_x8
_neon_x8:
#else
.globl neon_x8
neon_x8:
#endif
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst1.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst1.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst1.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst1.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst1.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst1.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst1.32 {q4,q5}, [r8, :128]!
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_x8_t
_neon_x8_t:
#else
.globl neon_x8_t
neon_x8_t:
#endif
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_t_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst2.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst2.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst2.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst2.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst2.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst2.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst2.32 {q4,q5}, [r8, :128]!
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
@bx lr
@ assumes r0 = out
@ r1 = in ?
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_ee
_neon_ee:
#else
.globl neon_ee
neon_ee:
#endif
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @-
vadd.f32 d7, d31, d26 @-
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @-
vsub.f32 d4, d18, d15 @-
vsub.f32 d13, d19, d14 @-
vadd.f32 d12, d18, d15 @-
vsub.f32 d15, d31, d26 @-
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @-
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop
@ assumes r0 = out
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_oo
_neon_oo:
#else
.globl neon_oo
neon_oo:
#endif
_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vsub.f32 d7, d19, d16 @
vadd.f32 d3, d19, d16 @
vadd.f32 d6, d18, d17 @
vsub.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vsub.f32 d15, d19, d16 @
ldr lr, [r12], #4
vadd.f32 d11, d19, d16 @
vadd.f32 d14, d18, d17 @
vsub.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_loop
@ assumes r0 = out
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = addr of twiddle
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_eo
_neon_eo:
#else
.globl neon_eo
neon_eo:
#endif
vld2.32 {q9}, [r5, :128]! @tag2
vld2.32 {q13}, [r3, :128]! @tag0
vld2.32 {q12}, [r4, :128]! @tag1
vld2.32 {q0}, [r7, :128]! @tag4
vsub.f32 q11, q13, q12
vld2.32 {q8}, [r6, :128]! @tag3
vadd.f32 q12, q13, q12
vsub.f32 q10, q9, q8
vadd.f32 q8, q9, q8
vadd.f32 q9, q12, q8
vadd.f32 d9, d23, d20 @
vsub.f32 d11, d23, d20 @
vsub.f32 q8, q12, q8
vsub.f32 d8, d22, d21 @
vadd.f32 d10, d22, d21 @
ldr r2, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
ldr lr, [r12], #4
vtrn.32 q9, q4
add r2, r0, r2, lsl #2
vtrn.32 q8, q5
add lr, r0, lr, lsl #2
vswp d9,d10
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
vsub.f32 q14, q15, q13
vsub.f32 q12, q0, q11
vadd.f32 q11, q0, q11
vadd.f32 q13, q15, q13
vadd.f32 d13, d29, d24 @
vadd.f32 q15, q13, q11
vsub.f32 d12, d28, d25 @
vsub.f32 d15, d29, d24 @
vadd.f32 d14, d28, d25 @
vtrn.32 q15, q6
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
vmul.f32 d28, d27, d20
vmul.f32 d25, d26, d20
vmul.f32 d26, d27, d21
vmul.f32 d27, d22, d21
vmul.f32 d30, d23, d20
vmul.f32 d29, d23, d21
vmul.f32 d22, d22, d20
vsub.f32 d21, d28, d24
vadd.f32 d20, d26, d25
vadd.f32 d25, d30, d27
vsub.f32 d24, d22, d29
vadd.f32 q11, q12, q10
vsub.f32 q10, q12, q10
vadd.f32 q0, q9, q11
vsub.f32 q2, q9, q11
vadd.f32 d3, d17, d20 @
vsub.f32 d7, d17, d20 @
vsub.f32 d2, d16, d21 @
vadd.f32 d6, d16, d21 @
vswp d1, d2
vswp d5, d6
vstmia r2!, {q0-q3}
@ assumes r0 = out
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = addr of twiddle
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _neon_oe
_neon_oe:
#else
.globl neon_oe
neon_oe:
#endif
vld1.32 {q8}, [r5, :128]!
vld1.32 {q10}, [r6, :128]!
vld2.32 {q11}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vld2.32 {q15}, [r10, :128]!
vorr d25, d17, d17
vorr d24, d20, d20
vorr d20, d16, d16
vsub.f32 q9, q13, q11
vadd.f32 q11, q13, q11
ldr r2, [r12], #4
vtrn.32 d24, d25
ldr lr, [r12], #4
vtrn.32 d20, d21
add r2, r0, r2, lsl #2
vsub.f32 q8, q10, q12
add lr, r0, lr, lsl #2
vadd.f32 q10, q10, q12
vadd.f32 q0, q11, q10
vadd.f32 d25, d19, d16 @
vsub.f32 d27, d19, d16 @
vsub.f32 q1, q11, q10
vsub.f32 d24, d18, d17 @
vadd.f32 d26, d18, d17 @
vtrn.32 q0, q12
vtrn.32 q1, q13
vld1.32 {d24, d25}, [r11, :128]
vswp d1, d2
vst1.32 {q0, q1}, [r2, :128]!
vld2.32 {q0}, [r9, :128]!
vadd.f32 q1, q0, q15
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vsub.f32 q15, q0, q15
vsub.f32 q0, q14, q13
vadd.f32 q3, q14, q13
vadd.f32 q2, q3, q1
vadd.f32 d29, d1, d30 @
vsub.f32 d27, d1, d30 @
vsub.f32 q3, q3, q1
vsub.f32 d28, d0, d31 @
vadd.f32 d26, d0, d31 @
vtrn.32 q2, q14
vtrn.32 q3, q13
vswp d5, d6
vst1.32 {q2, q3}, [r2, :128]!
vtrn.32 q11, q9
vtrn.32 q10, q8
vmul.f32 d20, d18, d25
vmul.f32 d22, d19, d24
vmul.f32 d21, d19, d25
vmul.f32 d18, d18, d24
vmul.f32 d19, d16, d25
vmul.f32 d30, d17, d24
vmul.f32 d23, d16, d24
vmul.f32 d24, d17, d25
vadd.f32 d17, d22, d20
vsub.f32 d16, d18, d21
vsub.f32 d21, d30, d19
vadd.f32 d20, d24, d23
vadd.f32 q9, q8, q10
vsub.f32 q8, q8, q10
vadd.f32 q4, q14, q9
vsub.f32 q6, q14, q9
vadd.f32 d11, d27, d16 @
vsub.f32 d15, d27, d16 @
vsub.f32 d10, d26, d17 @
vadd.f32 d14, d26, d17 @
vswp d9, d10
vswp d13, d14
vstmia lr!, {q4-q7}
.align 4
#ifdef __APPLE__
.globl _neon_end
_neon_end:
#else
.globl neon_end
neon_end:
#endif
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_transpose
_neon_transpose:
#else
.globl neon_transpose
neon_transpose:
#endif
push {r4-r8}
@ vpush {q8-q9}
mov r5, r3
_neon_transpose_col:
mov r7, r1
add r8, r1, r3, lsl #3
mov r4, r2
add r6, r0, r2, lsl #3
_neon_transpose_row:
vld1.32 {q8,q9}, [r0, :128]!
@ vld1.32 {q10,q11}, [r0, :128]!
vld1.32 {q12,q13}, [r6, :128]!
@ vld1.32 {q14,q15}, [r6, :128]!
sub r4, r4, #4
cmp r4, #0
vswp d17,d24
vswp d19,d26
vswp d21,d28
vswp d23,d30
vst1.32 {q8}, [r7, :128]
vst1.32 {q12}, [r8, :128]
add r7, r7, r3, lsl #4
add r8, r8, r3, lsl #4
vst1.32 {q9}, [r7, :128]
vst1.32 {q13}, [r8, :128]
add r7, r7, r3, lsl #4
add r8, r8, r3, lsl #4
@@vst1.32 {q10}, [r7, :128]
@@vst1.32 {q14}, [r8, :128]
@@add r7, r7, r3, lsl #4
@@add r8, r8, r3, lsl #4
@@vst1.32 {q11}, [r7, :128]
@@vst1.32 {q15}, [r8, :128]
@@add r7, r7, r3, lsl #4
@@add r8, r8, r3, lsl #4
bne _neon_transpose_row
sub r5, r5, #2
cmp r5, #0
add r0, r0, r2, lsl #3
add r1, r1, #16
bne _neon_transpose_col
@ vpop {q8-q9}
pop {r4-r8}
bx lr
.align 4
#ifdef __APPLE__
.globl _neon_transpose_to_buf
_neon_transpose_to_buf:
#else
.globl neon_transpose_to_buf
neon_transpose_to_buf:
#endif
push {r4-r10}
mov r5, #8
_neon_transpose_to_buf_col:
mov r4, #8
add r6, r0, r2, lsl #3
mov r7, r1
add r8, r1, #64
add r9, r1, #128
add r10, r1, #192
_neon_transpose_to_buf_row:
vld1.32 {q8,q9}, [r0, :128]!
vld1.32 {q12,q13}, [r6, :128]!
sub r4, r4, #4
cmp r4, #0
vswp d17,d24
vswp d19,d26
vst1.32 {q8}, [r7, :128]
vst1.32 {q12}, [r8, :128]
vst1.32 {q9}, [r9, :128]
vst1.32 {q13}, [r10, :128]
add r7, r7, #256
add r8, r8, #256
add r9, r9, #256
add r10, r10, #256
bne _neon_transpose_to_buf_row
sub r5, r5, #2
cmp r5, #0
sub r0, r0, #64
add r0, r0, r2, lsl #4
add r1, r1, #16
bne _neon_transpose_to_buf_col
pop {r4-r10}
bx lr
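
The two transpose routines above stream pairs of rows and exchange 2x2 blocks of 8-byte elements with vswp. As a hedged plain-C reference for what neon_transpose is expected to compute (treating each interleaved complex float as one uint64_t, row-major, with my mapping of the w/h arguments to strides -- a reconstruction, not part of FFTS):

#include <stdint.h>

/* Reference transpose of an h-row by w-column matrix of 8-byte
   elements (one interleaved complex float per element). The NEON
   version reaches the same result two rows at a time via vswp. */
static void transpose_ref(const uint64_t *in, uint64_t *out, int w, int h)
{
    for (int row = 0; row < h; row++)
        for (int col = 0; col < w; col++)
            out[(int64_t)col * h + row] = in[(int64_t)row * w + col];
}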

3rdparty/ffts/ffts-master/src/neon_float.h vendored Normal file

File diff suppressed because it is too large.


@@ -0,0 +1,956 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.align 4
#ifdef __APPLE__
.globl _neon_static_e_f
_neon_static_e_f:
#else
.globl neon_static_e_f
neon_static_e_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop
ldr r11, [r1, #12]
vld2.32 {q9}, [r5, :128]! @tag2
vld2.32 {q13}, [r3, :128]! @tag0
vld2.32 {q12}, [r4, :128]! @tag1
vld2.32 {q0}, [r7, :128]! @tag4
vsub.f32 q11, q13, q12
vld2.32 {q8}, [r6, :128]! @tag3
vadd.f32 q12, q13, q12
vsub.f32 q10, q9, q8
vadd.f32 q8, q9, q8
vadd.f32 q9, q12, q8
vsub.f32 d9, d23, d20 @
vadd.f32 d11, d23, d20 @
vsub.f32 q8, q12, q8
vadd.f32 d8, d22, d21 @
vsub.f32 d10, d22, d21 @
ldr r2, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
ldr lr, [r12], #4
vtrn.32 q9, q4
add r2, r0, r2, lsl #2
vtrn.32 q8, q5
add lr, r0, lr, lsl #2
vswp d9,d10
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
vsub.f32 q14, q15, q13
vsub.f32 q12, q0, q11
vadd.f32 q11, q0, q11
vadd.f32 q13, q15, q13
vsub.f32 d13, d29, d24 @
vadd.f32 q15, q13, q11
vadd.f32 d12, d28, d25 @
vadd.f32 d15, d29, d24 @
vsub.f32 d14, d28, d25 @
vtrn.32 q15, q6
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
vmul.f32 d28, d27, d20
vmul.f32 d25, d26, d20
vmul.f32 d26, d27, d21
vmul.f32 d27, d22, d21
vmul.f32 d30, d23, d20
vmul.f32 d29, d23, d21
vmul.f32 d22, d22, d20
vsub.f32 d21, d28, d24
vadd.f32 d20, d26, d25
vadd.f32 d25, d30, d27
vsub.f32 d24, d22, d29
vadd.f32 q11, q12, q10
vsub.f32 q10, q12, q10
vadd.f32 q0, q9, q11
vsub.f32 q2, q9, q11
vsub.f32 d3, d17, d20 @
vadd.f32 d7, d17, d20 @
vadd.f32 d2, d16, d21 @
vsub.f32 d6, d16, d21 @
vswp d1, d2
vswp d5, d6
vstmia r2!, {q0-q3}
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_loop_exit
_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vadd.f32 d7, d19, d16 @
vsub.f32 d3, d19, d16 @
vsub.f32 d6, d18, d17 @
vadd.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vadd.f32 d15, d19, d16 @
ldr lr, [r12], #4
vsub.f32 d11, d19, d16 @
vsub.f32 d14, d18, d17 @
vadd.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_loop
_neon_oo_loop_exit:
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop2
_neon_ee_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_o_f
_neon_static_o_f:
#else
.globl neon_static_o_f
neon_static_o_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_o_loop_exit
_neon_oo_o_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vadd.f32 d7, d19, d16 @
vsub.f32 d3, d19, d16 @
vsub.f32 d6, d18, d17 @
vadd.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vadd.f32 d15, d19, d16 @
ldr lr, [r12], #4
vsub.f32 d11, d19, d16 @
vsub.f32 d14, d18, d17 @
vadd.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_o_loop
_neon_oo_o_loop_exit:
ldr r11, [r1, #8]
vld1.32 {q8}, [r5, :128]!
vld1.32 {q10}, [r6, :128]!
vld2.32 {q11}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vld2.32 {q15}, [r10, :128]!
vorr d25, d17, d17
vorr d24, d20, d20
vorr d20, d16, d16
vsub.f32 q9, q13, q11
vadd.f32 q11, q13, q11
ldr r2, [r12], #4
vtrn.32 d24, d25
ldr lr, [r12], #4
vtrn.32 d20, d21
add r2, r0, r2, lsl #2
vsub.f32 q8, q10, q12
add lr, r0, lr, lsl #2
vadd.f32 q10, q10, q12
vadd.f32 q0, q11, q10
vsub.f32 d25, d19, d16 @
vadd.f32 d27, d19, d16 @
vsub.f32 q1, q11, q10
vadd.f32 d24, d18, d17 @
vsub.f32 d26, d18, d17 @
vtrn.32 q0, q12
vtrn.32 q1, q13
vld1.32 {d24, d25}, [r11, :128]
vswp d1, d2
vst1.32 {q0, q1}, [r2, :128]!
vld2.32 {q0}, [r9, :128]!
vadd.f32 q1, q0, q15
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vsub.f32 q15, q0, q15
vsub.f32 q0, q14, q13
vadd.f32 q3, q14, q13
vadd.f32 q2, q3, q1
vsub.f32 d29, d1, d30 @
vadd.f32 d27, d1, d30 @
vsub.f32 q3, q3, q1
vadd.f32 d28, d0, d31 @
vsub.f32 d26, d0, d31 @
vtrn.32 q2, q14
vtrn.32 q3, q13
vswp d5, d6
vst1.32 {q2, q3}, [r2, :128]!
vtrn.32 q11, q9
vtrn.32 q10, q8
vmul.f32 d20, d18, d25
vmul.f32 d22, d19, d24
vmul.f32 d21, d19, d25
vmul.f32 d18, d18, d24
vmul.f32 d19, d16, d25
vmul.f32 d30, d17, d24
vmul.f32 d23, d16, d24
vmul.f32 d24, d17, d25
vadd.f32 d17, d22, d20
vsub.f32 d16, d18, d21
vsub.f32 d21, d30, d19
vadd.f32 d20, d24, d23
vadd.f32 q9, q8, q10
vsub.f32 q8, q8, q10
vadd.f32 q4, q14, q9
vsub.f32 q6, q14, q9
vsub.f32 d11, d27, d16 @
vadd.f32 d15, d27, d16 @
vadd.f32 d10, d26, d17 @
vsub.f32 d14, d26, d17 @
vswp d9, d10
vswp d13, d14
vstmia lr!, {q4-q7}
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_o_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vsub.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vadd.f32 d31, d5, d2 @
vadd.f32 d28, d4, d3 @
vsub.f32 d30, d4, d3 @
vsub.f32 d5, d19, d14 @
vsub.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vadd.f32 d6, d30, d27 @
vadd.f32 d4, d18, d15 @
vadd.f32 d13, d19, d14 @
vsub.f32 d12, d18, d15 @
vadd.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vsub.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop2
_neon_ee_o_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x4_f
_neon_static_x4_f:
#else
.globl neon_static_x4_f
neon_static_x4_f:
#endif
@ add r3, r0, #0
push {r4, r5, r6, lr}
vstmdb sp!, {d8-d15}
vld1.32 {q8,q9}, [r0, :128]
add r4, r0, r1, lsl #1
vld1.32 {q10,q11}, [r4, :128]
add r5, r0, r1, lsl #2
vld1.32 {q12,q13}, [r5, :128]
add r6, r4, r1, lsl #2
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q2,q3}, [r2, :128]
vmul.f32 q0, q13, q3
vmul.f32 q5, q12, q2
vmul.f32 q1, q14, q2
vmul.f32 q4, q14, q3
vmul.f32 q14, q12, q3
vmul.f32 q13, q13, q2
vmul.f32 q12, q15, q3
vmul.f32 q2, q15, q2
vsub.f32 q0, q5, q0
vadd.f32 q13, q13, q14
vadd.f32 q12, q12, q1
vsub.f32 q1, q2, q4
vadd.f32 q15, q0, q12
vsub.f32 q12, q0, q12
vadd.f32 q14, q13, q1
vsub.f32 q13, q13, q1
vadd.f32 q0, q8, q15
vadd.f32 q1, q9, q14
vadd.f32 q2, q10, q13 @
vsub.f32 q4, q8, q15
vsub.f32 q3, q11, q12 @
vst1.32 {q0,q1}, [r0, :128]
vsub.f32 q5, q9, q14
vsub.f32 q6, q10, q13 @
vadd.f32 q7, q11, q12 @
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
vldmia sp!, {d8-d15}
pop {r4, r5, r6, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_f
_neon_static_x8_f:
#else
.globl neon_static_x8_f
neon_static_x8_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vadd.f32 q4, q12, q15 @
vsub.f32 q6, q12, q15 @
vsub.f32 q5, q13, q14 @
vadd.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vadd.f32 q2, q8, q10 @
vsub.f32 q3, q9, q12 @
vst1.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vadd.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst1.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst1.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst1.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vadd.f32 q2, q10, q15 @
vsub.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst1.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vsub.f32 q6, q10, q15 @
vst1.32 {q2,q3}, [r6, :128]!
vadd.f32 q7, q11, q14 @
vst1.32 {q4,q5}, [r8, :128]!
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_t_f
_neon_static_x8_t_f:
#else
.globl neon_static_x8_t_f
neon_static_x8_t_f:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_t_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vadd.f32 q4, q12, q15 @
vsub.f32 q6, q12, q15 @
vsub.f32 q5, q13, q14 @
vadd.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vadd.f32 q2, q8, q10 @
vsub.f32 q3, q9, q12 @
vst2.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vadd.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst2.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst2.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst2.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vadd.f32 q2, q10, q15 @
vsub.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst2.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vsub.f32 q6, q10, q15 @
vst2.32 {q2,q3}, [r6, :128]!
vadd.f32 q7, q11, q14 @
vst2.32 {q4,q5}, [r8, :128]!
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}


@@ -0,0 +1,955 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.align 4
#ifdef __APPLE__
.globl _neon_static_e_i
_neon_static_e_i:
#else
.globl neon_static_e_i
neon_static_e_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop
ldr r11, [r1, #12]
vld2.32 {q9}, [r5, :128]! @tag2
vld2.32 {q13}, [r3, :128]! @tag0
vld2.32 {q12}, [r4, :128]! @tag1
vld2.32 {q0}, [r7, :128]! @tag4
vsub.f32 q11, q13, q12
vld2.32 {q8}, [r6, :128]! @tag3
vadd.f32 q12, q13, q12
vsub.f32 q10, q9, q8
vadd.f32 q8, q9, q8
vadd.f32 q9, q12, q8
vadd.f32 d9, d23, d20 @
vsub.f32 d11, d23, d20 @
vsub.f32 q8, q12, q8
vsub.f32 d8, d22, d21 @
vadd.f32 d10, d22, d21 @
ldr r2, [r12], #4
vld1.32 {d20, d21}, [r11, :128]
ldr lr, [r12], #4
vtrn.32 q9, q4
add r2, r0, r2, lsl #2
vtrn.32 q8, q5
add lr, r0, lr, lsl #2
vswp d9,d10
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
vld2.32 {q13}, [r10, :128]! @tag7
vld2.32 {q15}, [r9, :128]! @tag6
vld2.32 {q11}, [r8, :128]! @tag5
vsub.f32 q14, q15, q13
vsub.f32 q12, q0, q11
vadd.f32 q11, q0, q11
vadd.f32 q13, q15, q13
vadd.f32 d13, d29, d24 @
vadd.f32 q15, q13, q11
vsub.f32 d12, d28, d25 @
vsub.f32 d15, d29, d24 @
vadd.f32 d14, d28, d25 @
vtrn.32 q15, q6
vsub.f32 q15, q13, q11
vtrn.32 q15, q7
vswp d13, d14
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
vtrn.32 q13, q14
vtrn.32 q11, q12
vmul.f32 d24, d26, d21
vmul.f32 d28, d27, d20
vmul.f32 d25, d26, d20
vmul.f32 d26, d27, d21
vmul.f32 d27, d22, d21
vmul.f32 d30, d23, d20
vmul.f32 d29, d23, d21
vmul.f32 d22, d22, d20
vsub.f32 d21, d28, d24
vadd.f32 d20, d26, d25
vadd.f32 d25, d30, d27
vsub.f32 d24, d22, d29
vadd.f32 q11, q12, q10
vsub.f32 q10, q12, q10
vadd.f32 q0, q9, q11
vsub.f32 q2, q9, q11
vadd.f32 d3, d17, d20 @
vsub.f32 d7, d17, d20 @
vsub.f32 d2, d16, d21 @
vadd.f32 d6, d16, d21 @
vswp d1, d2
vswp d5, d6
vstmia r2!, {q0-q3}
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_loop_exit
_neon_oo_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vsub.f32 d7, d19, d16 @
vadd.f32 d3, d19, d16 @
vadd.f32 d6, d18, d17 @
vsub.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vsub.f32 d15, d19, d16 @
ldr lr, [r12], #4
vadd.f32 d11, d19, d16 @
vadd.f32 d14, d18, d17 @
vsub.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_loop
_neon_oo_loop_exit:
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_loop2
_neon_ee_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_o_i
_neon_static_o_i:
#else
.globl neon_static_o_i
neon_static_o_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
ldr lr, [r0, #40] @ this is p->N
add r3, r1, #0
add r7, r1, lr
add r5, r7, lr
add r10, r5, lr
add r4, r10, lr
add r8, r4, lr
add r6, r8, lr
add r9, r6, lr
ldr r12, [r0]
add r1, r0, #0
add r0, r2, #0
ldr r2, [r1, #16] @ this is p->ee_ws
ldr r11, [r1, #28] @ this is p->i0
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop
add r2, r7, #0
add r7, r9, #0
add r9, r2, #0
add r2, r8, #0
add r8, r10, #0
add r10, r2, #0
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_oo_o_loop_exit
_neon_oo_o_loop:
vld2.32 {q8}, [r6, :128]!
vld2.32 {q9}, [r5, :128]!
vld2.32 {q10}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vadd.f32 q11, q9, q8
vsub.f32 q8, q9, q8
vsub.f32 q9, q13, q10
vadd.f32 q12, q13, q10
subs r11, r11, #1
vld2.32 {q10}, [r7, :128]!
vld2.32 {q13}, [r9, :128]!
vsub.f32 q2, q12, q11
vsub.f32 d7, d19, d16 @
vadd.f32 d3, d19, d16 @
vadd.f32 d6, d18, d17 @
vsub.f32 d2, d18, d17 @
vld2.32 {q9}, [r8, :128]!
vld2.32 {q8}, [r10, :128]!
vadd.f32 q0, q12, q11
vadd.f32 q11, q13, q8
vadd.f32 q12, q10, q9
vsub.f32 q8, q13, q8
vsub.f32 q9, q10, q9
vsub.f32 q6, q12, q11
vadd.f32 q4, q12, q11
vtrn.32 q0, q2
ldr r2, [r12], #4
vsub.f32 d15, d19, d16 @
ldr lr, [r12], #4
vadd.f32 d11, d19, d16 @
vadd.f32 d14, d18, d17 @
vsub.f32 d10, d18, d17 @
add r2, r0, r2, lsl #2
vtrn.32 q1, q3
add lr, r0, lr, lsl #2
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_oo_o_loop
_neon_oo_o_loop_exit:
ldr r11, [r1, #8]
vld1.32 {q8}, [r5, :128]!
vld1.32 {q10}, [r6, :128]!
vld2.32 {q11}, [r4, :128]!
vld2.32 {q13}, [r3, :128]!
vld2.32 {q15}, [r10, :128]!
vorr d25, d17, d17
vorr d24, d20, d20
vorr d20, d16, d16
vsub.f32 q9, q13, q11
vadd.f32 q11, q13, q11
ldr r2, [r12], #4
vtrn.32 d24, d25
ldr lr, [r12], #4
vtrn.32 d20, d21
add r2, r0, r2, lsl #2
vsub.f32 q8, q10, q12
add lr, r0, lr, lsl #2
vadd.f32 q10, q10, q12
vadd.f32 q0, q11, q10
vadd.f32 d25, d19, d16 @
vsub.f32 d27, d19, d16 @
vsub.f32 q1, q11, q10
vsub.f32 d24, d18, d17 @
vadd.f32 d26, d18, d17 @
vtrn.32 q0, q12
vtrn.32 q1, q13
vld1.32 {d24, d25}, [r11, :128]
vswp d1, d2
vst1.32 {q0, q1}, [r2, :128]!
vld2.32 {q0}, [r9, :128]!
vadd.f32 q1, q0, q15
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vsub.f32 q15, q0, q15
vsub.f32 q0, q14, q13
vadd.f32 q3, q14, q13
vadd.f32 q2, q3, q1
vadd.f32 d29, d1, d30 @
vsub.f32 d27, d1, d30 @
vsub.f32 q3, q3, q1
vsub.f32 d28, d0, d31 @
vadd.f32 d26, d0, d31 @
vtrn.32 q2, q14
vtrn.32 q3, q13
vswp d5, d6
vst1.32 {q2, q3}, [r2, :128]!
vtrn.32 q11, q9
vtrn.32 q10, q8
vmul.f32 d20, d18, d25
vmul.f32 d22, d19, d24
vmul.f32 d21, d19, d25
vmul.f32 d18, d18, d24
vmul.f32 d19, d16, d25
vmul.f32 d30, d17, d24
vmul.f32 d23, d16, d24
vmul.f32 d24, d17, d25
vadd.f32 d17, d22, d20
vsub.f32 d16, d18, d21
vsub.f32 d21, d30, d19
vadd.f32 d20, d24, d23
vadd.f32 q9, q8, q10
vsub.f32 q8, q8, q10
vadd.f32 q4, q14, q9
vsub.f32 q6, q14, q9
vadd.f32 d11, d27, d16 @
vsub.f32 d15, d27, d16 @
vsub.f32 d10, d26, d17 @
vadd.f32 d14, d26, d17 @
vswp d9, d10
vswp d13, d14
vstmia lr!, {q4-q7}
add r2, r3, #0
add r3, r7, #0
add r7, r2, #0
add r2, r4, #0
add r4, r8, #0
add r8, r2, #0
add r2, r5, #0
add r5, r9, #0
add r9, r2, #0
add r2, r6, #0
add r6, r10, #0
add r10, r2, #0
add r2, r9, #0
add r9, r10, #0
add r10, r2, #0
ldr r2, [r1, #16]
ldr r11, [r1, #32] @ this is p->i1
cmp r11, #0
beq _neon_ee_o_loop2_exit
vld1.32 {d16, d17}, [r2, :128]
_neon_ee_o_loop2:
vld2.32 {q15}, [r10, :128]!
vld2.32 {q13}, [r8, :128]!
vld2.32 {q14}, [r7, :128]!
vld2.32 {q9}, [r4, :128]!
vld2.32 {q10}, [r3, :128]!
vld2.32 {q11}, [r6, :128]!
vld2.32 {q12}, [r5, :128]!
vsub.f32 q1, q14, q13
vld2.32 {q0}, [r9, :128]!
subs r11, r11, #1
vsub.f32 q2, q0, q15
vadd.f32 q0, q0, q15
vmul.f32 d10, d2, d17
vmul.f32 d11, d3, d16
vmul.f32 d12, d3, d17
vmul.f32 d6, d4, d17
vmul.f32 d7, d5, d16
vmul.f32 d8, d4, d16
vmul.f32 d9, d5, d17
vmul.f32 d13, d2, d16
vsub.f32 d7, d7, d6
vadd.f32 d11, d11, d10
vsub.f32 q1, q12, q11
vsub.f32 q2, q10, q9
vadd.f32 d6, d9, d8
vadd.f32 q4, q14, q13
vadd.f32 q11, q12, q11
vadd.f32 q12, q10, q9
vsub.f32 d10, d13, d12
vsub.f32 q7, q4, q0
vsub.f32 q9, q12, q11
vsub.f32 q13, q5, q3
vadd.f32 d29, d5, d2 @
vadd.f32 q5, q5, q3
vadd.f32 q10, q4, q0
vadd.f32 q11, q12, q11
vsub.f32 d31, d5, d2 @
vsub.f32 d28, d4, d3 @
vadd.f32 d30, d4, d3 @
vadd.f32 d5, d19, d14 @
vadd.f32 d7, d31, d26 @
vadd.f32 q1, q14, q5
vadd.f32 q0, q11, q10
vsub.f32 d6, d30, d27 @
vsub.f32 d4, d18, d15 @
vsub.f32 d13, d19, d14 @
vadd.f32 d12, d18, d15 @
vsub.f32 d15, d31, d26 @
ldr r2, [r12], #4
vtrn.32 q1, q3
ldr lr, [r12], #4
vtrn.32 q0, q2
add r2, r0, r2, lsl #2
vsub.f32 q4, q11, q10
add lr, r0, lr, lsl #2
vsub.f32 q5, q14, q5
vadd.f32 d14, d30, d27 @
vst2.32 {q0,q1}, [r2, :128]!
vst2.32 {q2,q3}, [lr, :128]!
vtrn.32 q4, q6
vtrn.32 q5, q7
vst2.32 {q4,q5}, [r2, :128]!
vst2.32 {q6,q7}, [lr, :128]!
bne _neon_ee_o_loop2
_neon_ee_o_loop2_exit:
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x4_i
_neon_static_x4_i:
#else
.globl neon_static_x4_i
neon_static_x4_i:
#endif
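@ Added note (my reading, not original commentary): this appears to be a
@ single in-place radix-4 butterfly pass -- r0 = data, r1 scales the address
@ strides, r2 = twiddle factors. Four complex quads are loaded, the upper two
@ are complex-multiplied by the twiddle pair, and the results are recombined
@ and stored back to the same addresses.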
@ add r3, r0, #0
push {r4, r5, r6, lr}
vstmdb sp!, {d8-d15}
vld1.32 {q8,q9}, [r0, :128]
add r4, r0, r1, lsl #1
vld1.32 {q10,q11}, [r4, :128]
add r5, r0, r1, lsl #2
vld1.32 {q12,q13}, [r5, :128]
add r6, r4, r1, lsl #2
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q2,q3}, [r2, :128]
vmul.f32 q0, q13, q3
vmul.f32 q5, q12, q2
vmul.f32 q1, q14, q2
vmul.f32 q4, q14, q3
vmul.f32 q14, q12, q3
vmul.f32 q13, q13, q2
vmul.f32 q12, q15, q3
vmul.f32 q2, q15, q2
vsub.f32 q0, q5, q0
vadd.f32 q13, q13, q14
vadd.f32 q12, q12, q1
vsub.f32 q1, q2, q4
vadd.f32 q15, q0, q12
vsub.f32 q12, q0, q12
vadd.f32 q14, q13, q1
vsub.f32 q13, q13, q1
vadd.f32 q0, q8, q15
vadd.f32 q1, q9, q14
vsub.f32 q2, q10, q13 @
vsub.f32 q4, q8, q15
vadd.f32 q3, q11, q12 @
vst1.32 {q0,q1}, [r0, :128]
vsub.f32 q5, q9, q14
vadd.f32 q6, q10, q13 @
vsub.f32 q7, q11, q12 @
vst1.32 {q2,q3}, [r4, :128]
vst1.32 {q4,q5}, [r5, :128]
vst1.32 {q6,q7}, [r6, :128]
vldmia sp!, {d8-d15}
pop {r4, r5, r6, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_i
_neon_static_x8_i:
#else
.globl neon_static_x8_i
neon_static_x8_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
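@ Added note: r11 is initialised to -(r1 >> 5) and the adds at the top of the
@ loop increments it toward zero, so the body runs r1/32 times -- each pass
@ appears to consume 32 bytes (four complex floats) from every data pointer.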
neon_x8_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst1.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst1.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst1.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst1.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst1.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst1.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst1.32 {q4,q5}, [r8, :128]!
vst1.32 {q6,q7}, [r10, :128]!
bne neon_x8_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.align 4
#ifdef __APPLE__
.globl _neon_static_x8_t_i
_neon_static_x8_t_i:
#else
.globl neon_static_x8_t_i
neon_static_x8_t_i:
#endif
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
vstmdb sp!, {d8-d15}
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #5
neon_x8_t_loop:
vld1.32 {q2,q3}, [r12, :128]!
vld1.32 {q14,q15}, [r6, :128]
vld1.32 {q10,q11}, [r5, :128]
adds r11, r11, #1
vmul.f32 q12, q15, q2
vmul.f32 q8, q14, q3
vmul.f32 q13, q14, q2
vmul.f32 q9, q10, q3
vmul.f32 q1, q10, q2
vmul.f32 q0, q11, q2
vmul.f32 q14, q11, q3
vmul.f32 q15, q15, q3
vld1.32 {q2,q3}, [r12, :128]!
vsub.f32 q10, q12, q8
vadd.f32 q11, q0, q9
vadd.f32 q8, q15, q13
vld1.32 {q12,q13}, [r4, :128]
vsub.f32 q9, q1, q14
vsub.f32 q15, q11, q10
vsub.f32 q14, q9, q8
vsub.f32 q4, q12, q15 @
vadd.f32 q6, q12, q15 @
vadd.f32 q5, q13, q14 @
vsub.f32 q7, q13, q14 @
vld1.32 {q14,q15}, [r9, :128]
vld1.32 {q12,q13}, [r7, :128]
vmul.f32 q1, q14, q2
vmul.f32 q0, q14, q3
vst1.32 {q4,q5}, [r4, :128]
vmul.f32 q14, q15, q3
vmul.f32 q4, q15, q2
vadd.f32 q15, q9, q8
vst1.32 {q6,q7}, [r6, :128]
vmul.f32 q8, q12, q3
vmul.f32 q5, q13, q3
vmul.f32 q12, q12, q2
vmul.f32 q9, q13, q2
vadd.f32 q14, q14, q1
vsub.f32 q13, q4, q0
vadd.f32 q0, q9, q8
vld1.32 {q8,q9}, [r3, :128]
vadd.f32 q1, q11, q10
vsub.f32 q12, q12, q5
vadd.f32 q11, q8, q15
vsub.f32 q8, q8, q15
vadd.f32 q2, q12, q14
vsub.f32 q10, q0, q13
vadd.f32 q15, q0, q13
vadd.f32 q13, q9, q1
vsub.f32 q9, q9, q1
vsub.f32 q12, q12, q14
vadd.f32 q0, q11, q2
vadd.f32 q1, q13, q15
vsub.f32 q4, q11, q2
vsub.f32 q2, q8, q10 @
vadd.f32 q3, q9, q12 @
vst2.32 {q0,q1}, [r3, :128]!
vsub.f32 q5, q13, q15
vld1.32 {q14,q15}, [r10, :128]
vsub.f32 q7, q9, q12 @
vld1.32 {q12,q13}, [r8, :128]
vst2.32 {q2,q3}, [r5, :128]!
vld1.32 {q2,q3}, [r12, :128]!
vadd.f32 q6, q8, q10 @
vmul.f32 q8, q14, q2
vst2.32 {q4,q5}, [r7, :128]!
vmul.f32 q10, q15, q3
vmul.f32 q9, q13, q3
vmul.f32 q11, q12, q2
vmul.f32 q14, q14, q3
vst2.32 {q6,q7}, [r9, :128]!
vmul.f32 q15, q15, q2
vmul.f32 q12, q12, q3
vmul.f32 q13, q13, q2
vadd.f32 q10, q10, q8
vsub.f32 q11, q11, q9
vld1.32 {q8,q9}, [r4, :128]
vsub.f32 q14, q15, q14
vadd.f32 q15, q13, q12
vadd.f32 q13, q11, q10
vadd.f32 q12, q15, q14
vsub.f32 q15, q15, q14
vsub.f32 q14, q11, q10
vld1.32 {q10,q11}, [r6, :128]
vadd.f32 q0, q8, q13
vadd.f32 q1, q9, q12
vsub.f32 q2, q10, q15 @
vadd.f32 q3, q11, q14 @
vsub.f32 q4, q8, q13
vst2.32 {q0,q1}, [r4, :128]!
vsub.f32 q5, q9, q12
vadd.f32 q6, q10, q15 @
vst2.32 {q2,q3}, [r6, :128]!
vsub.f32 q7, q11, q14 @
vst2.32 {q4,q5}, [r8, :128]!
vst2.32 {q6,q7}, [r10, :128]!
bne neon_x8_t_loop
vldmia sp!, {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}

209
3rdparty/ffts/ffts-master/src/patterns.c vendored Normal file
View File

@@ -0,0 +1,209 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "patterns.h"
void permute_addr(int N, int offset, int stride, int *d) {
int i, a[4] = {0,2,1,3};
for(i=0;i<4;i++) {
d[i] = offset + (a[i] << stride);
if(d[i] < 0) d[i] += N;
}
}
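/* Example (illustrative, not from the original): permute_addr(16, 0, 0, d)
   yields d = {0, 2, 1, 3}; with a negative offset, permute_addr(16, -2, 0, d)
   wraps around to d = {14, 0, 15, 1} -- addresses are reduced modulo N. */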
void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
	if(N > 4) {
		ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride), offset+(N/2), stride + 2, 0, VL);
		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride), offset+(3*N/4), stride + 2, 0, VL);
		else { /* note: this else binds to the second if(N/4 >= 4) above */
			int temp = poffset+(1<<stride);
			if(temp < 0) temp += bigN;
			temp *= 2;

			if(!(temp % (VL*2))) {
				(*is)[0] = poffset+(1<<stride);
				(*is)[1] = poffset+(1<<stride)+(1<<(stride+2));
				(*is)[2] = poffset-(1<<stride);
				(*is)[3] = poffset-(1<<stride)+(1<<(stride+2));
				int i;
				for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN;
				for(i=0;i<4;i++) (*is)[i] *= 2;
				*is += 4;
			}
		}
	} else if(N == 4) {
		int perm[4];
		permute_addr(bigN, poffset, stride, perm);
		if(!((perm[0]*2) % (VL*2))) {
			int i;
			for(i=0;i<4;i++) {
				(*is)[i] = perm[i] * 2;
			}
			*is += 4;
		}
	}
}
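/* Added note (my reading of the recursion above): it walks the split-radix
   decomposition and, for each size-4 leaf whose first input address is
   vector-aligned, emits four input indices, doubled for the interleaved
   re/im float layout. */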
void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
	int i, i0 = N/leafN/3+1, i1 = N/leafN/3, i2 = N/leafN/3;
	int stride = log(N/leafN)/log(2);

	p->is = malloc(N/VL * sizeof(ptrdiff_t));
	ptrdiff_t *is = p->is;

	if((N/leafN) % 3 > 1) i1++;

	for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
	for(i=i0;i<i0+i1;i++) {
		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
	}
	for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);

	//for(i=0;i<N/VL;i++) {
	//	printf("%td ", p->is[i]);
	//	if(i % 16 == 15) printf("\n");
	//}

	p->i0 = i0; p->i1 = i1;
}
/**
 *
 *
 */
void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
	if((even && N == leafN) || (!even && N <= leafN)) {
		offsets[2*(ooffset/leafN)] = ioffset*2;
		offsets[2*(ooffset/leafN)+1] = ooffset;
	} else if(N > 4) {
		ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
		if(N/4 >= leafN)
			ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
	}
}
int compare_offsets(const void *a, const void *b) {
	return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
}
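/* Added note: the ptrdiff_t difference is truncated to int on return; this
   is safe here only because the compared offsets are bounded by N. */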
uint32_t reverse_bits(uint32_t a, int n) {
	uint32_t x = 0;
	int i;
	for(i=0;i<n;i++) {
		if(a & (1 << i)) x |= 1 << (n-i-1);
	}
	return x;
}
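/* Example: reverse_bits(0x3, 4) == 0xC -- 0b0011 reflected in a 4-bit field
   is 0b1100. Within this file it is only used by the debug print commented
   out in ffts_init_offsets() below. */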
void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
	ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));

	ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);

	size_t i;
	for(i=0;i<2*N/leafN;i+=2) {
		if(offsets[i] < 0) offsets[i] = N + offsets[i];
	}

	qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets);
	//elaborate_is(p, N, 0, 0, 1);

	p->offsets = malloc(N/leafN * sizeof(ptrdiff_t));
	for(i=0;i<N/leafN;i++) {
		p->offsets[i] = offsets[i*2+1]*2;
	}

	//for(i=0;i<N/leafN;i++) {
	//	printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
	//}

	free(offsets);
}
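/* Added note (my reading): the (input, output) pairs are sorted by input
   index and only the output half of each pair is kept, pre-doubled for the
   interleaved re/im layout. */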
/*
int tree_count(int N, int leafN, int offset) {
	if(N <= leafN) return 0;

	int count = 0;
	count += tree_count(N/4, leafN, offset);
	count += tree_count(N/8, leafN, offset + N/4);
	count += tree_count(N/8, leafN, offset + N/4 + N/8);
	count += tree_count(N/4, leafN, offset + N/2);
	count += tree_count(N/4, leafN, offset + 3*N/4);

	return 1 + count;
}

void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
	if(N <= leafN) return;

	elaborate_tree(p, N/4, leafN, offset);
	elaborate_tree(p, N/8, leafN, offset + N/4);
	elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
	elaborate_tree(p, N/4, leafN, offset + N/2);
	elaborate_tree(p, N/4, leafN, offset + 3*N/4);

	(*p)[0] = N;
	(*p)[1] = offset*2;
	(*p) += 2;
}

void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
	int count = tree_count(N, leafN, 0) + 1;
	transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));

	//printf("count = %d\n", count);

	elaborate_tree(&ps, N, leafN, 0);
#ifdef __ARM_NEON__
	ps -= 2;
#endif
	ps[0] = 0;
	ps[1] = 0;

	//int i;
	//for(i=0;i<count;i++) {
	//	fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
	//		__builtin_ctzl(p->transforms[i*2]) - 5);
	//}
}
*/

// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

45
3rdparty/ffts/ffts-master/src/patterns.h vendored Normal file
View File

@@ -0,0 +1,45 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __PATTERNS_H__
#define __PATTERNS_H__
#include "ffts.h"
void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL);
void ffts_init_offsets(ffts_plan_t *p, int N, int leafN);
//void ffts_init_tree(ffts_plan_t *p, int N, int leafN);
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

878
3rdparty/ffts/ffts-master/src/sse.s vendored Normal file
View File

@@ -0,0 +1,878 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.globl _neon_x4
.align 4
_neon_x4:
.globl _neon_x8
.align 4
_neon_x8:
.globl _neon_x8_t
.align 4
_neon_x8_t:
#ifdef __APPLE__
.globl _leaf_ee_init
_leaf_ee_init:
#else
.globl leaf_ee_init
leaf_ee_init:
#endif
#lea L_sse_constants(%rip), %r9
movq 0xe0(%rdi), %r9
xorl %eax, %eax
# eax is loop counter (init to 0)
# rcx is loop max count
# rsi is 'in' base pointer
# rdx is 'out' base pointer
# r8 is offsets pointer
# r9 is constants pointer
# scratch: rax r11 r12
# .align 4, 0x90
# _leaf_ee + 9 needs 16 byte alignment
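# Added note (not in the original): the 0xFECA displacements in the
# LEAF_*_const_* loads below are placeholders. The code generator copies
# these leaf bodies at plan time and patches the real input offsets into
# the instruction bytes recorded in the sse_leaf_*_offsets tables at the
# end of this file.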
#ifdef __APPLE__
.globl _leaf_ee
_leaf_ee:
#else
.globl leaf_ee
leaf_ee:
#endif
movaps 32(%r9), %xmm0 #83.5
movaps (%r9), %xmm8 #83.5
LEAF_EE_1:
LEAF_EE_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
LEAF_EE_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
movaps %xmm7, %xmm6 #83.5
LEAF_EE_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
movaps %xmm12, %xmm11 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm10, %xmm11 #83.5
xorps %xmm8, %xmm12 #83.5
LEAF_EE_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
LEAF_EE_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
addps %xmm9, %xmm6 #83.5
subps %xmm9, %xmm7 #83.5
LEAF_EE_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
movaps %xmm10, %xmm9 #83.5
LEAF_EE_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
movaps %xmm6, %xmm5 #83.5
LEAF_EE_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
movaps %xmm3, %xmm15 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm7, %xmm4 #83.5
movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm13, %xmm10 #83.5
subps %xmm14, %xmm3 #83.5
addps %xmm11, %xmm5 #83.5
subps %xmm11, %xmm6 #83.5
subps %xmm12, %xmm4 #83.5
addps %xmm12, %xmm7 #83.5
addps %xmm13, %xmm9 #83.5
addps %xmm14, %xmm15 #83.5
movaps 16(%r9), %xmm12 #83.5
movaps %xmm9, %xmm1 #83.5
movaps 16(%r9), %xmm11 #83.5
movaps %xmm5, %xmm2 #83.5
mulps %xmm10, %xmm12 #83.5
subps %xmm15, %xmm9 #83.5
addps %xmm15, %xmm1 #83.5
mulps %xmm3, %xmm11 #83.5
addps %xmm1, %xmm2 #83.5
subps %xmm1, %xmm5 #83.5
shufps $177, %xmm10, %xmm10 #83.5
xorps %xmm8, %xmm9 #83.5
shufps $177, %xmm3, %xmm3 #83.5
movaps %xmm6, %xmm1 #83.5
mulps %xmm0, %xmm10 #83.5
movaps %xmm4, %xmm13 #83.5
mulps %xmm0, %xmm3 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm3, %xmm11 #83.5
movaps %xmm12, %xmm3 #83.5
movaps %xmm7, %xmm14 #83.5
shufps $177, %xmm9, %xmm9 #83.5
subps %xmm11, %xmm12 #83.5
addps %xmm11, %xmm3 #83.5
subps %xmm9, %xmm1 #83.5
addps %xmm9, %xmm6 #83.5
addps %xmm3, %xmm4 #83.5
subps %xmm3, %xmm13 #83.5
xorps %xmm8, %xmm12 #83.5
movaps %xmm2, %xmm3 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm6, %xmm9 #83.5
movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm4, %xmm3 #83.5
addq $4, %rax
shufps $238, %xmm4, %xmm2 #83.5
movaps %xmm1, %xmm4 #83.5
#movntdq %xmm3, (%rdx,%r11,4) #83.5
subps %xmm12, %xmm7 #83.5
addps %xmm12, %xmm14 #83.5
movlhps %xmm7, %xmm4 #83.5
shufps $238, %xmm7, %xmm1 #83.5
movaps %xmm5, %xmm7 #83.5
movlhps %xmm13, %xmm7 #83.5
movlhps %xmm14, %xmm9 #83.5
shufps $238, %xmm13, %xmm5 #83.5
shufps $238, %xmm14, %xmm6 #83.5
movaps %xmm3, (%rdx,%r11,4) #83.5
movaps %xmm4, 16(%rdx,%r11,4) #83.5
movaps %xmm7, 32(%rdx,%r11,4) #83.5
movaps %xmm9, 48(%rdx,%r11,4) #83.5
movaps %xmm2, (%rdx,%r12,4) #83.5
movaps %xmm1, 16(%rdx,%r12,4) #83.5
movaps %xmm5, 32(%rdx,%r12,4) #83.5
movaps %xmm6, 48(%rdx,%r12,4) #83.5
cmpq %rcx, %rax
jne LEAF_EE_1
# _leaf_oo + 4 needs to be 16 byte aligned
#ifdef __APPLE__
.globl _leaf_oo
_leaf_oo:
#else
.globl leaf_oo
leaf_oo:
#endif
movaps (%r9), %xmm5 #92.7
LEAF_OO_1:
LEAF_OO_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
movaps %xmm4, %xmm6 #93.5
LEAF_OO_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
LEAF_OO_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
addps %xmm7, %xmm6 #93.5
subps %xmm7, %xmm4 #93.5
LEAF_OO_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
movaps %xmm10, %xmm9 #93.5
LEAF_OO_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
movaps %xmm6, %xmm3 #93.5
LEAF_OO_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
movaps %xmm1, %xmm2 #93.5
LEAF_OO_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
movaps %xmm4, %xmm15 #93.5
LEAF_OO_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
movaps %xmm14, %xmm13 #93.5
movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm8, %xmm10 #93.5
addps %xmm8, %xmm9 #93.5
addps %xmm11, %xmm2 #93.5
subps %xmm12, %xmm14 #93.5
subps %xmm11, %xmm1 #93.5
addps %xmm12, %xmm13 #93.5
addps %xmm9, %xmm3 #93.5
subps %xmm9, %xmm6 #93.5
xorps %xmm5, %xmm10 #93.5
xorps %xmm5, %xmm14 #93.5
shufps $177, %xmm10, %xmm10 #93.5
movaps %xmm2, %xmm9 #93.5
shufps $177, %xmm14, %xmm14 #93.5
movaps %xmm6, %xmm7 #93.5
movslq 8(%r8, %rax, 4), %r12 #83.59
addq $4, %rax #92.18
addps %xmm10, %xmm4 #93.5
addps %xmm13, %xmm9 #93.5
subps %xmm13, %xmm2 #93.5
subps %xmm10, %xmm15 #93.5
movaps %xmm1, %xmm13 #93.5
movaps %xmm2, %xmm8 #93.5
movlhps %xmm4, %xmm7 #93.5
subps %xmm14, %xmm13 #93.5
addps %xmm14, %xmm1 #93.5
shufps $238, %xmm4, %xmm6 #93.5
movaps %xmm3, %xmm14 #93.5
movaps %xmm9, %xmm4 #93.5
movlhps %xmm15, %xmm14 #93.5
movlhps %xmm13, %xmm4 #93.5
movlhps %xmm1, %xmm8 #93.5
shufps $238, %xmm15, %xmm3 #93.5
shufps $238, %xmm13, %xmm9 #93.5
shufps $238, %xmm1, %xmm2 #93.5
movaps %xmm14, (%rdx,%r11,4) #93.5
movaps %xmm7, 16(%rdx,%r11,4) #93.5
movaps %xmm4, 32(%rdx,%r11,4) #93.5
movaps %xmm8, 48(%rdx,%r11,4) #93.5
movaps %xmm3, (%rdx,%r12,4) #93.5
movaps %xmm6, 16(%rdx,%r12,4) #93.5
movaps %xmm9, 32(%rdx,%r12,4) #93.5
movaps %xmm2, 48(%rdx,%r12,4) #93.5
cmpq %rcx, %rax
jne LEAF_OO_1 # Prob 95% #92.14
#ifdef __APPLE__
.globl _leaf_eo
_leaf_eo:
#else
.globl leaf_eo
leaf_eo:
#endif
LEAF_EO_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
LEAF_EO_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
movaps %xmm9, %xmm11 #88.5
LEAF_EO_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
movaps %xmm7, %xmm6 #88.5
LEAF_EO_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
subps %xmm5, %xmm7 #88.5
addps %xmm4, %xmm11 #88.5
subps %xmm4, %xmm9 #88.5
addps %xmm5, %xmm6 #88.5
movaps (%r9), %xmm3 #88.5
movaps %xmm11, %xmm10 #88.5
xorps %xmm3, %xmm7 #88.5
movaps %xmm9, %xmm8 #88.5
shufps $177, %xmm7, %xmm7 #88.5
addps %xmm6, %xmm10 #88.5
subps %xmm6, %xmm11 #88.5
subps %xmm7, %xmm8 #88.5
addps %xmm7, %xmm9 #88.5
movslq 8(%r8, %rax, 4), %r12 #83.59
movaps %xmm10, %xmm2 #88.5
movslq (%r8, %rax, 4), %r11 #83.44
movaps %xmm11, %xmm1 #88.5
shufps $238, %xmm8, %xmm10 #88.5
shufps $238, %xmm9, %xmm11 #88.5
movaps %xmm10, (%rdx,%r12,4) #88.5
movaps %xmm11, 16(%rdx,%r12,4) #88.5
LEAF_EO_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
LEAF_EO_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
movaps %xmm15, %xmm14 #88.5
LEAF_EO_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
addps %xmm12, %xmm14 #88.5
subps %xmm12, %xmm15 #88.5
LEAF_EO_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
movaps %xmm4, %xmm5 #88.5
movaps %xmm14, %xmm7 #88.5
addps %xmm13, %xmm5 #88.5
subps %xmm13, %xmm4 #88.5
movlhps %xmm8, %xmm2 #88.5
movaps %xmm5, %xmm8 #88.5
movlhps %xmm15, %xmm7 #88.5
xorps %xmm3, %xmm15 #88.5
movaps %xmm5, %xmm6 #88.5
subps %xmm14, %xmm5 #88.5
addps %xmm14, %xmm6 #88.5
movlhps %xmm9, %xmm1 #88.5
movaps %xmm4, %xmm14 #88.5
movlhps %xmm4, %xmm8 #88.5
movaps %xmm1, %xmm12 #88.5
shufps $177, %xmm15, %xmm15 #88.5
movaps 0x30(%r9), %xmm11 #88.5
addq $4, %rax #90.5
subps %xmm15, %xmm14 #88.5
mulps %xmm7, %xmm11 #88.5
addps %xmm15, %xmm4 #88.5
movaps 0x30(%r9), %xmm9 #88.5
movaps 0x40(%r9), %xmm15 #88.5
shufps $177, %xmm7, %xmm7 #88.5
mulps %xmm8, %xmm9 #88.5
mulps %xmm15, %xmm7 #88.5
shufps $177, %xmm8, %xmm8 #88.5
subps %xmm7, %xmm11 #88.5
mulps %xmm15, %xmm8 #88.5
movaps %xmm11, %xmm10 #88.5
addps %xmm8, %xmm9 #88.5
shufps $238, %xmm14, %xmm6 #88.5
subps %xmm9, %xmm11 #88.5
addps %xmm9, %xmm10 #88.5
xorps %xmm3, %xmm11 #88.5
movaps %xmm2, %xmm3 #88.5
shufps $177, %xmm11, %xmm11 #88.5
subps %xmm10, %xmm3 #88.5
addps %xmm10, %xmm2 #88.5
addps %xmm11, %xmm12 #88.5
subps %xmm11, %xmm1 #88.5
shufps $238, %xmm4, %xmm5 #88.5
movaps %xmm5, 48(%rdx,%r12,4) #88.5
movaps %xmm6, 32(%rdx,%r12,4) #88.5
movaps %xmm2, (%rdx,%r11,4) #88.5
movaps %xmm1, 16(%rdx,%r11,4) #88.5
movaps %xmm3, 32(%rdx,%r11,4) #88.5
movaps %xmm12, 48(%rdx,%r11,4) #88.5
#ifdef __APPLE__
.globl _leaf_oe
_leaf_oe:
#else
.globl leaf_oe
leaf_oe:
#endif
movaps (%r9), %xmm0 #59.5
#movaps 0x20(%r9), %xmm1 #59.5
LEAF_OE_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
LEAF_OE_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
movaps %xmm6, %xmm10 #70.5
shufps $228, %xmm8, %xmm10 #70.5
movaps %xmm10, %xmm9 #70.5
shufps $228, %xmm6, %xmm8 #70.5
LEAF_OE_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
LEAF_OE_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm12, %xmm14 #70.5
movslq (%r8, %rax, 4), %r11 #83.44
addps %xmm8, %xmm9 #70.5
subps %xmm8, %xmm10 #70.5
addps %xmm7, %xmm14 #70.5
subps %xmm7, %xmm12 #70.5
movaps %xmm9, %xmm4 #70.5
movaps %xmm14, %xmm13 #70.5
shufps $238, %xmm10, %xmm4 #70.5
xorps %xmm0, %xmm10 #70.5
shufps $177, %xmm10, %xmm10 #70.5
movaps %xmm12, %xmm11 #70.5
movaps %xmm14, %xmm5 #70.5
addps %xmm9, %xmm13 #70.5
subps %xmm10, %xmm11 #70.5
subps %xmm9, %xmm14 #70.5
shufps $238, %xmm12, %xmm5 #70.5
addps %xmm10, %xmm12 #70.5
movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm11, %xmm13 #70.5
movaps %xmm13, (%rdx,%r11,4) #70.5
movaps 0x30(%r9), %xmm13 #70.5
movlhps %xmm12, %xmm14 #70.5
movaps 0x40(%r9), %xmm12 #70.5
mulps %xmm5, %xmm13 #70.5
shufps $177, %xmm5, %xmm5 #70.5
mulps %xmm12, %xmm5 #70.5
movaps %xmm14, 16(%rdx,%r11,4) #70.5
subps %xmm5, %xmm13 #70.5
movaps 0x30(%r9), %xmm5 #70.5
mulps %xmm4, %xmm5 #70.5
shufps $177, %xmm4, %xmm4 #70.5
mulps %xmm12, %xmm4 #70.5
LEAF_OE_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
addps %xmm4, %xmm5 #70.5
LEAF_OE_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm9, %xmm3 #70.5
LEAF_OE_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
movaps %xmm7, %xmm6 #70.5
LEAF_OE_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
movaps %xmm13, %xmm4 #70.5
subps %xmm2, %xmm7 #70.5
addps %xmm15, %xmm3 #70.5
subps %xmm15, %xmm9 #70.5
addps %xmm2, %xmm6 #70.5
subps %xmm5, %xmm13 #70.5
addps %xmm5, %xmm4 #70.5
xorps %xmm0, %xmm7 #70.5
addq $4, %rax #72.5
movaps %xmm3, %xmm2 #70.5
shufps $177, %xmm7, %xmm7 #70.5
movaps %xmm9, %xmm8 #70.5
xorps %xmm0, %xmm13 #70.5
addps %xmm6, %xmm2 #70.5
subps %xmm7, %xmm8 #70.5
subps %xmm6, %xmm3 #70.5
addps %xmm7, %xmm9 #70.5
movaps %xmm2, %xmm10 #70.5
movaps %xmm3, %xmm11 #70.5
shufps $238, %xmm8, %xmm2 #70.5
shufps $238, %xmm9, %xmm3 #70.5
movaps %xmm2, %xmm14 #70.5
shufps $177, %xmm13, %xmm13 #70.5
subps %xmm4, %xmm14 #70.5
addps %xmm4, %xmm2 #70.5
movaps %xmm3, %xmm4 #70.5
subps %xmm13, %xmm3 #70.5
addps %xmm13, %xmm4 #70.5
movlhps %xmm8, %xmm10 #70.5
movlhps %xmm9, %xmm11 #70.5
movaps %xmm10, 32(%rdx,%r11,4) #70.5
movaps %xmm11, 48(%rdx,%r11,4) #70.5
movaps %xmm2, (%rdx,%r12,4) #70.5
movaps %xmm3, 16(%rdx,%r12,4) #70.5
movaps %xmm14, 32(%rdx,%r12,4) #70.5
movaps %xmm4, 48(%rdx,%r12,4) #70.5
#ifdef __APPLE__
.globl _leaf_end
_leaf_end:
#else
.globl leaf_end
leaf_end:
#endif
#ifdef __APPLE__
.globl _x_init
_x_init:
#else
.globl x_init
x_init:
#endif
#movaps L_sse_constants(%rip), %xmm3 #34.3
movaps (%r9), %xmm3 #34.3
movq 0x20(%rdi),%r8
#ifdef __APPLE__
.globl _x4
_x4:
#else
.globl x4
x4:
#endif
movaps 64(%rdx), %xmm0 #34.3
movaps 96(%rdx), %xmm1 #34.3
movaps (%rdx), %xmm7 #34.3
movaps (%r8), %xmm4 #const
movaps %xmm7, %xmm9 #34.3
movaps %xmm4, %xmm6 #34.3
movaps 16(%r8), %xmm2 #const
mulps %xmm0, %xmm6 #34.3
mulps %xmm1, %xmm4 #34.3
shufps $177, %xmm0, %xmm0 #34.3
shufps $177, %xmm1, %xmm1 #34.3
mulps %xmm2, %xmm0 #34.3
mulps %xmm1, %xmm2 #34.3
subps %xmm0, %xmm6 #34.3
addps %xmm2, %xmm4 #34.3
movaps %xmm6, %xmm5 #34.3
subps %xmm4, %xmm6 #34.3
addps %xmm4, %xmm5 #34.3
movaps 32(%rdx), %xmm8 #34.3
xorps %xmm3, %xmm6 #34.3
shufps $177, %xmm6, %xmm6 #34.3
movaps %xmm8, %xmm10 #34.3
movaps 112(%rdx), %xmm12 #34.3
subps %xmm5, %xmm9 #34.3
addps %xmm5, %xmm7 #34.3
addps %xmm6, %xmm10 #34.3
subps %xmm6, %xmm8 #34.3
movaps %xmm7, (%rdx) #34.3
movaps %xmm8, 32(%rdx) #34.3
movaps %xmm9, 64(%rdx) #34.3
movaps %xmm10, 96(%rdx) #34.3
movaps 32(%r8), %xmm14 #const #34.3
movaps 80(%rdx), %xmm11 #34.3
movaps %xmm14, %xmm0 #34.3
movaps 48(%r8), %xmm13 #const #34.3
mulps %xmm11, %xmm0 #34.3
mulps %xmm12, %xmm14 #34.3
shufps $177, %xmm11, %xmm11 #34.3
shufps $177, %xmm12, %xmm12 #34.3
mulps %xmm13, %xmm11 #34.3
mulps %xmm12, %xmm13 #34.3
subps %xmm11, %xmm0 #34.3
addps %xmm13, %xmm14 #34.3
movaps %xmm0, %xmm15 #34.3
subps %xmm14, %xmm0 #34.3
addps %xmm14, %xmm15 #34.3
xorps %xmm3, %xmm0 #34.3
movaps 16(%rdx), %xmm1 #34.3
movaps 48(%rdx), %xmm2 #34.3
movaps %xmm1, %xmm4 #34.3
shufps $177, %xmm0, %xmm0 #34.3
movaps %xmm2, %xmm5 #34.3
addps %xmm15, %xmm1 #34.3
subps %xmm0, %xmm2 #34.3
subps %xmm15, %xmm4 #34.3
addps %xmm0, %xmm5 #34.3
movaps %xmm1, 16(%rdx) #34.3
movaps %xmm2, 48(%rdx) #34.3
movaps %xmm4, 80(%rdx) #34.3
movaps %xmm5, 112(%rdx) #34.3
ret
# _x8_soft + 5 needs to be 16 byte aligned
#ifdef __APPLE__
.globl _x8_soft
_x8_soft:
#else
.globl x8_soft
x8_soft:
#endif
xorl %eax, %eax
movq %rdx, %rbx
movq %r8, %rsi
leaq (%rdx,%rcx,4), %r9
leaq (%r9,%rcx,4), %r10
leaq (%r10,%rcx,4), %r11
leaq (%r11,%rcx,4), %r12
leaq (%r12,%rcx,4), %r13
leaq (%r13,%rcx,4), %r14
leaq (%r14,%rcx,4), %r15
X8_soft_loop:
movaps (%rsi), %xmm9
movaps (%r10,%rax,4), %xmm6
movaps %xmm9, %xmm11
movaps (%r11,%rax,4), %xmm7
movaps 16(%rsi), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
shufps $177, %xmm6, %xmm6
mulps %xmm8, %xmm6
shufps $177, %xmm7, %xmm7
subps %xmm6, %xmm11
mulps %xmm7, %xmm8
movaps %xmm11, %xmm10
addps %xmm8, %xmm9
movaps 32(%rsi), %xmm15
addps %xmm9, %xmm10
subps %xmm9, %xmm11
movaps (%rbx,%rax,4), %xmm5
movaps %xmm15, %xmm6
movaps (%r12,%rax,4), %xmm12
movaps %xmm5, %xmm2
movaps (%r14,%rax,4), %xmm13
xorps %xmm3, %xmm11 #const
movaps 48(%rsi), %xmm14
subps %xmm10, %xmm2
mulps %xmm12, %xmm6
addps %xmm10, %xmm5
mulps %xmm13, %xmm15
movaps 64(%rsi), %xmm10
movaps %xmm5, %xmm0
shufps $177, %xmm12, %xmm12
shufps $177, %xmm13, %xmm13
mulps %xmm14, %xmm12
mulps %xmm13, %xmm14
subps %xmm12, %xmm6
addps %xmm14, %xmm15
movaps (%r13,%rax,4), %xmm7
movaps %xmm10, %xmm13
movaps (%r15,%rax,4), %xmm8
movaps %xmm6, %xmm12
movaps 80(%rsi), %xmm9
addq $96, %rsi
mulps %xmm7, %xmm13
subps %xmm15, %xmm6
addps %xmm15, %xmm12
mulps %xmm8, %xmm10
subps %xmm12, %xmm0
addps %xmm12, %xmm5
shufps $177, %xmm7, %xmm7
xorps %xmm3, %xmm6 #const
shufps $177, %xmm8, %xmm8
movaps %xmm2, %xmm12
mulps %xmm9, %xmm7
mulps %xmm8, %xmm9
subps %xmm7, %xmm13
addps %xmm9, %xmm10
movaps (%r9,%rax,4), %xmm4
shufps $177, %xmm11, %xmm11
movaps %xmm4, %xmm1
shufps $177, %xmm6, %xmm6
addps %xmm11, %xmm1
subps %xmm11, %xmm4
addps %xmm6, %xmm12
subps %xmm6, %xmm2
movaps %xmm13, %xmm11
movaps %xmm4, %xmm14
movaps %xmm1, %xmm6
subps %xmm10, %xmm13
addps %xmm10, %xmm11
xorps %xmm3, %xmm13 #const
addps %xmm11, %xmm4
subps %xmm11, %xmm14
shufps $177, %xmm13, %xmm13
movaps %xmm5, (%rbx,%rax,4)
movaps %xmm4, (%r9,%rax,4)
movaps %xmm2, (%r10,%rax,4)
subps %xmm13, %xmm1
addps %xmm13, %xmm6
movaps %xmm1, (%r11,%rax,4)
movaps %xmm0, (%r12,%rax,4)
movaps %xmm14, (%r13,%rax,4)
movaps %xmm12, (%r14,%rax,4)
movaps %xmm6, (%r15,%rax,4)
addq $4, %rax
cmpq %rcx, %rax
jne X8_soft_loop
ret
#ifdef __APPLE__
.globl _x8_hard
_x8_hard:
#else
.globl x8_hard
x8_hard:
#endif
movaps (%r9), %xmm5
X8_loop:
movaps (%r8), %xmm9
X8_const_2:
movaps 0xFECA(%rdx,%rax,4), %xmm6
movaps %xmm9, %xmm11
X8_const_3:
movaps 0xFECA(%rdx,%rax,4), %xmm7
movaps 16(%r8), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
shufps $177, %xmm6, %xmm6
mulps %xmm8, %xmm6
shufps $177, %xmm7, %xmm7
subps %xmm6, %xmm11
mulps %xmm7, %xmm8
movaps %xmm11, %xmm10
addps %xmm8, %xmm9
movaps 32(%r8), %xmm15
addps %xmm9, %xmm10
subps %xmm9, %xmm11
X8_const_0:
movaps 0xFECA(%rdx,%rax,4), %xmm3
movaps %xmm15, %xmm6
X8_const_4:
movaps 0xFECA(%rdx,%rax,4), %xmm12
movaps %xmm3, %xmm2
X8_const_6:
movaps 0xFECA(%rdx,%rax,4), %xmm13
xorps %xmm5, %xmm11
movaps 48(%r8), %xmm14
subps %xmm10, %xmm2
mulps %xmm12, %xmm6
addps %xmm10, %xmm3
mulps %xmm13, %xmm15
movaps 64(%r8), %xmm10
movaps %xmm3, %xmm0
shufps $177, %xmm12, %xmm12
shufps $177, %xmm13, %xmm13
mulps %xmm14, %xmm12
mulps %xmm13, %xmm14
subps %xmm12, %xmm6
addps %xmm14, %xmm15
X8_const_5:
movaps 0xFECA(%rdx,%rax,4), %xmm7
movaps %xmm10, %xmm13
X8_const_7:
movaps 0xFECA(%rdx,%rax,4), %xmm8
movaps %xmm6, %xmm12
movaps 80(%r8), %xmm9
addq $96, %r8
mulps %xmm7, %xmm13
subps %xmm15, %xmm6
addps %xmm15, %xmm12
mulps %xmm8, %xmm10
subps %xmm12, %xmm0
addps %xmm12, %xmm3
shufps $177, %xmm7, %xmm7
xorps %xmm5, %xmm6
shufps $177, %xmm8, %xmm8
movaps %xmm2, %xmm12
mulps %xmm9, %xmm7
mulps %xmm8, %xmm9
subps %xmm7, %xmm13
addps %xmm9, %xmm10
X8_const_1:
movaps 0xFECA(%rdx,%rax,4), %xmm4
shufps $177, %xmm11, %xmm11
movaps %xmm4, %xmm1
shufps $177, %xmm6, %xmm6
addps %xmm11, %xmm1
subps %xmm11, %xmm4
addps %xmm6, %xmm12
subps %xmm6, %xmm2
movaps %xmm13, %xmm11
movaps %xmm4, %xmm14
movaps %xmm1, %xmm6
subps %xmm10, %xmm13
addps %xmm10, %xmm11
xorps %xmm5, %xmm13
addps %xmm11, %xmm4
subps %xmm11, %xmm14
shufps $177, %xmm13, %xmm13
X8_const1_0:
movaps %xmm3, 0xFECA(%rdx,%rax,4)
X8_const1_1:
movaps %xmm4, 0xFECA(%rdx,%rax,4)
X8_const1_2:
movaps %xmm2, 0xFECA(%rdx,%rax,4)
subps %xmm13, %xmm1
addps %xmm13, %xmm6
X8_const1_3:
movaps %xmm1, 0xFECA(%rdx,%rax,4)
X8_const1_4:
movaps %xmm0, 0xFECA(%rdx,%rax,4)
X8_const1_5:
movaps %xmm14, 0xFECA(%rdx,%rax,4)
X8_const1_6:
movaps %xmm12, 0xFECA(%rdx,%rax,4)
X8_const1_7:
movaps %xmm6, 0xFECA(%rdx,%rax,4)
addq $4, %rax
cmpq %rcx, %rax
jne X8_loop
#ifdef __APPLE__
.globl _sse_leaf_ee_offsets
.globl _sse_leaf_oo_offsets
.globl _sse_leaf_eo_offsets
.globl _sse_leaf_oe_offsets
.align 4
_sse_leaf_ee_offsets:
.long LEAF_EE_const_0-_leaf_ee+0x4
.long LEAF_EE_const_1-_leaf_ee+0x5
.long LEAF_EE_const_2-_leaf_ee+0x5
.long LEAF_EE_const_3-_leaf_ee+0x5
.long LEAF_EE_const_4-_leaf_ee+0x5
.long LEAF_EE_const_5-_leaf_ee+0x5
.long LEAF_EE_const_6-_leaf_ee+0x4
.long LEAF_EE_const_7-_leaf_ee+0x5
_sse_leaf_oo_offsets:
.long LEAF_OO_const_0-_leaf_oo+0x4
.long LEAF_OO_const_1-_leaf_oo+0x4
.long LEAF_OO_const_2-_leaf_oo+0x5
.long LEAF_OO_const_3-_leaf_oo+0x5
.long LEAF_OO_const_4-_leaf_oo+0x4
.long LEAF_OO_const_5-_leaf_oo+0x5
.long LEAF_OO_const_6-_leaf_oo+0x5
.long LEAF_OO_const_7-_leaf_oo+0x5
_sse_leaf_eo_offsets:
.long LEAF_EO_const_0-_leaf_eo+0x5
.long LEAF_EO_const_1-_leaf_eo+0x4
.long LEAF_EO_const_2-_leaf_eo+0x4
.long LEAF_EO_const_3-_leaf_eo+0x4
.long LEAF_EO_const_4-_leaf_eo+0x5
.long LEAF_EO_const_5-_leaf_eo+0x5
.long LEAF_EO_const_6-_leaf_eo+0x4
.long LEAF_EO_const_7-_leaf_eo+0x5
_sse_leaf_oe_offsets:
.long LEAF_OE_const_0-_leaf_oe+0x5
.long LEAF_OE_const_1-_leaf_oe+0x4
.long LEAF_OE_const_2-_leaf_oe+0x4
.long LEAF_OE_const_3-_leaf_oe+0x5
.long LEAF_OE_const_4-_leaf_oe+0x5
.long LEAF_OE_const_5-_leaf_oe+0x5
.long LEAF_OE_const_6-_leaf_oe+0x4
.long LEAF_OE_const_7-_leaf_oe+0x4
#else
.globl sse_leaf_ee_offsets
.globl sse_leaf_oo_offsets
.globl sse_leaf_eo_offsets
.globl sse_leaf_oe_offsets
.align 4
sse_leaf_ee_offsets:
.long LEAF_EE_const_0-leaf_ee+0x4
.long LEAF_EE_const_1-leaf_ee+0x5
.long LEAF_EE_const_2-leaf_ee+0x5
.long LEAF_EE_const_3-leaf_ee+0x5
.long LEAF_EE_const_4-leaf_ee+0x5
.long LEAF_EE_const_5-leaf_ee+0x5
.long LEAF_EE_const_6-leaf_ee+0x4
.long LEAF_EE_const_7-leaf_ee+0x5
sse_leaf_oo_offsets:
.long LEAF_OO_const_0-leaf_oo+0x4
.long LEAF_OO_const_1-leaf_oo+0x4
.long LEAF_OO_const_2-leaf_oo+0x5
.long LEAF_OO_const_3-leaf_oo+0x5
.long LEAF_OO_const_4-leaf_oo+0x4
.long LEAF_OO_const_5-leaf_oo+0x5
.long LEAF_OO_const_6-leaf_oo+0x5
.long LEAF_OO_const_7-leaf_oo+0x5
sse_leaf_eo_offsets:
.long LEAF_EO_const_0-leaf_eo+0x5
.long LEAF_EO_const_1-leaf_eo+0x4
.long LEAF_EO_const_2-leaf_eo+0x4
.long LEAF_EO_const_3-leaf_eo+0x4
.long LEAF_EO_const_4-leaf_eo+0x5
.long LEAF_EO_const_5-leaf_eo+0x5
.long LEAF_EO_const_6-leaf_eo+0x4
.long LEAF_EO_const_7-leaf_eo+0x5
sse_leaf_oe_offsets:
.long LEAF_OE_const_0-leaf_oe+0x5
.long LEAF_OE_const_1-leaf_oe+0x4
.long LEAF_OE_const_2-leaf_oe+0x4
.long LEAF_OE_const_3-leaf_oe+0x5
.long LEAF_OE_const_4-leaf_oe+0x5
.long LEAF_OE_const_5-leaf_oe+0x5
.long LEAF_OE_const_6-leaf_oe+0x4
.long LEAF_OE_const_7-leaf_oe+0x4
#endif
#ifdef __APPLE__
.data
#else
.section .data
#endif
.p2align 4
#ifdef __APPLE__
.globl _sse_constants
_sse_constants:
#else
.globl sse_constants
sse_constants:
#endif
.long 0x00000000,0x80000000,0x00000000,0x80000000
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
.long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
.long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
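# Added note: in these tables 0x3f3504f3 is +0.70710677f (1/sqrt(2)),
# 0xbf3504f3 is its negation and 0x3f800000 is 1.0f; the first row
# ({+0.0,-0.0,+0.0,-0.0}) is the sign mask xorps'd in above to negate
# alternating lanes.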
#ifdef __APPLE__
.globl _sse_constants_inv
_sse_constants_inv:
#else
.globl sse_constants_inv
sse_constants_inv:
#endif
.long 0x80000000,0x00000000,0x80000000,0x00000000
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
.long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
.long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3

50
3rdparty/ffts/ffts-master/src/types.h vendored Normal file
View File

@@ -0,0 +1,50 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __TYPES_H__
#define __TYPES_H__
#define __INLINE static inline __attribute__((always_inline))
#if defined(complex)
typedef complex float cdata_t;
#else
typedef float cdata_t[2];
#endif
typedef float data_t;
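/* Added note: with <complex.h> in effect (the `complex` macro defined),
   cdata_t is a C99 complex float; otherwise it is a two-float array with
   x[0] = re and x[1] = im -- the same interleaved layout either way. */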
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

46
3rdparty/ffts/ffts-master/src/vfp.h vendored Normal file
View File

@@ -0,0 +1,46 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, 2013 The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __VFP_H__
#define __VFP_H__
#include "ffts.h"
void vfp_e();
void vfp_o();
void vfp_x4();
void vfp_x8();
void vfp_end();
#endif
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:

473
3rdparty/ffts/ffts-master/src/vfp.s vendored Normal file
View File

@@ -0,0 +1,473 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, 2013 The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@ assumes r0 = out
@ r1 = in ?
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 = const pointer
@ & lr = temps
.align 4
#ifdef __APPLE__
.globl _vfp_e
_vfp_e:
#else
.globl vfp_e
vfp_e:
#endif
_vfp_e_loop:
vldr s15, [r2, #8]
vldr s2, [r3] @ x0
vldr s0, [r3, #4]
vldr s4, [r4] @ x1
vldr s11, [r2]
vldr s10, [r7] @ x4
vldr s3, [r7, #4]
vldr s8, [r8] @ x5
vldr s1, [r8, #4]
vldr s14, [r9] @ x6
vldr s9, [r9, #4]
vldr s6, [r10] @ x7
vldr s12, [r10, #4]
vsub.f32 s18, s3, s1
vsub.f32 s7, s10, s8
vsub.f32 s5, s14, s6
vadd.f32 s6, s14, s6
vldr s24, [r5, #4]
vsub.f32 s14, s9, s12
vldr s22, [r6, #4]
vadd.f32 s8, s10, s8
vldr s28, [r6] @ x3
vldr s17, [r5] @ x2
vadd.f32 s10, s9, s12
vmul.f32 s13, s18, s15
vmul.f32 s9, s7, s11
vmul.f32 s16, s5, s11
vmul.f32 s18, s18, s11
vmul.f32 s30, s14, s11
vldr s11, [r4, #4]
add r3, r3, #8
add r4, r4, #8
add r5, r5, #8
add r6, r6, #8
add r7, r7, #8
add r8, r8, #8
add r9, r9, #8
add r10, r10, #8
vmul.f32 s12, s5, s15
vmul.f32 s20, s14, s15
vadd.f32 s5, s2, s4
vadd.f32 s3, s3, s1
vmul.f32 s15, s7, s15
vadd.f32 s1, s24, s22
vsub.f32 s7, s24, s22
vadd.f32 s24, s17, s28
vadd.f32 s26, s0, s11
vsub.f32 s14, s9, s13
vsub.f32 s2, s2, s4
vadd.f32 s4, s16, s20
vsub.f32 s22, s0, s11
vsub.f32 s16, s17, s28
vadd.f32 s9, s5, s24
vadd.f32 s28, s18, s15
vadd.f32 s13, s8, s6
vsub.f32 s5, s5, s24
vsub.f32 s24, s8, s6
vadd.f32 s11, s26, s1
vsub.f32 s12, s30, s12
vadd.f32 s20, s3, s10
vsub.f32 s15, s3, s10
vsub.f32 s3, s26, s1
vadd.f32 s18, s9, s13
vadd.f32 s10, s14, s4
vadd.f32 s6, s2, s7 @
vsub.f32 s0, s2, s7 @
vadd.f32 s26, s11, s20
vsub.f32 s4, s14, s4
vsub.f32 s8, s22, s16 @
vadd.f32 s1, s28, s12
ldr lr, [r12], #4
add lr, r0, lr, lsl #2
subs r11, r11, #1
vstr s18, [lr]
vsub.f32 s2, s28, s12
vadd.f32 s12, s22, s16 @
vsub.f32 s16, s3, s24 @
vsub.f32 s13, s9, s13
vstr s26, [lr, #4]
vadd.f32 s28, s5, s15 @
vsub.f32 s7, s5, s15 @
vadd.f32 s14, s6, s10
vadd.f32 s5, s8, s1
vadd.f32 s9, s0, s2 @
vsub.f32 s2, s0, s2 @
vsub.f32 s11, s11, s20
vstr s28, [lr, #16]
vadd.f32 s3, s3, s24 @
vstr s16, [lr, #20]
vsub.f32 s6, s6, s10
vstr s13, [lr, #32]
vsub.f32 s13, s12, s4 @
vsub.f32 s8, s8, s1
vadd.f32 s0, s12, s4 @
vstr s11, [lr, #36]
vstr s7, [lr, #48]
vstr s3, [lr, #52]
vstr s14, [lr, #8]
vstr s5, [lr, #12]
vstr s9, [lr, #24]
vstr s13, [lr, #28]
vstr s6, [lr, #40]
vstr s8, [lr, #44]
vstr s2, [lr, #56]
vstr s0, [lr, #60]
bne _vfp_e_loop
@ assumes r0 = out
@ r1 = in ?
@
@ r12 = offsets
@ r3-r10 = data pointers
@ r11 = loop iterations
@ r2 & lr = temps
.align 4
#ifdef __APPLE__
.globl _vfp_o
_vfp_o:
#else
.globl vfp_o
vfp_o:
#endif
_vfp_o_loop:
vldr s4, [r3] @ x0
vldr s0, [r3, #4]
vldr s6, [r4] @ x1
vldr s5, [r4, #4]
vldr s7, [r5] @ x2
vldr s1, [r5, #4]
vldr s3, [r6] @ x3
vldr s8, [r6, #4]
subs r11, r11, #1
ldr r2, [r12], #4
add r2, r0, r2, lsl #2
vadd.f32 s2, s4, s6
vadd.f32 s14, s0, s5
vadd.f32 s10, s1, s8
vsub.f32 s4, s4, s6
vsub.f32 s0, s0, s5
vadd.f32 s12, s7, s3
vsub.f32 s6, s7, s3
vsub.f32 s8, s1, s8
vadd.f32 s5, s14, s10
vsub.f32 s10, s14, s10
vadd.f32 s7, s2, s12
vsub.f32 s1, s0, s6 @
vsub.f32 s12, s2, s12
vadd.f32 s3, s4, s8 @
vsub.f32 s2, s4, s8 @
vadd.f32 s0, s0, s6 @
vstr s7, [r2]
vldr s7, [r9] @ x2
vstr s5, [r2, #4]
vstr s3, [r2, #8]
vstr s1, [r2, #12]
vstr s12, [r2, #16]
vstr s10, [r2, #20]
vstr s2, [r2, #24]
vstr s0, [r2, #28]
vldr s4, [r7] @ x0
vldr s0, [r7, #4]
vldr s6, [r8] @ x1
vldr s5, [r8, #4]
vldr s3, [r10] @ x3
vldr s8, [r10, #4]
vldr s1, [r9, #4]
add r3, r3, #8
add r4, r4, #8
add r5, r5, #8
add r6, r6, #8
add r7, r7, #8
add r8, r8, #8
add r9, r9, #8
add r10, r10, #8
vadd.f32 s2, s4, s6
vadd.f32 s14, s0, s5
vadd.f32 s10, s1, s8
vsub.f32 s4, s4, s6
vsub.f32 s0, s0, s5
vadd.f32 s12, s7, s3
vsub.f32 s6, s7, s3
vsub.f32 s8, s1, s8
vadd.f32 s5, s14, s10
vsub.f32 s10, s14, s10
vadd.f32 s7, s2, s12
vsub.f32 s1, s0, s6 @
vsub.f32 s12, s2, s12
vadd.f32 s3, s4, s8 @
vsub.f32 s2, s4, s8 @
vadd.f32 s0, s0, s6 @
vstr s7, [r2, #32]
vstr s5, [r2, #36]
vstr s3, [r2, #40]
vstr s1, [r2, #44]
vstr s12, [r2, #48]
vstr s10, [r2, #52]
vstr s2, [r2, #56]
vstr s0, [r2, #60]
bne _vfp_o_loop
.align 4
#ifdef __APPLE__
.globl _vfp_x4
_vfp_x4:
#else
.globl vfp_x4
vfp_x4:
#endif
add r3, r0, #0
add r7, r2, #0
add r4, r0, r1, lsl #1
add r5, r0, r1, lsl #2
add r6, r4, r1, lsl #2
mov r11, #4
_vfp_x4_loop:
vldr s8, [r3, #0]
vldr s9, [r3, #4]
vldr s10, [r4, #0]
vldr s11, [r4, #4]
vldr s12, [r5, #0]
vldr s13, [r5, #4]
vldr s14, [r6, #0]
vldr s15, [r6, #4]
vldr s2, [r7, #0]
vldr s3, [r7, #4]
add r7, r7, #8
subs r11, r11, #1
vmul.f32 s0, s13, s3
vmul.f32 s5, s12, s2
vmul.f32 s1, s14, s2
vmul.f32 s4, s14, s3
vmul.f32 s14, s12, s3
vmul.f32 s13, s13, s2
vmul.f32 s12, s15, s3
vmul.f32 s2, s15, s2
vsub.f32 s0, s5, s0
vadd.f32 s13, s13, s14
vadd.f32 s12, s12, s1
vsub.f32 s1, s2, s4
vadd.f32 s15, s0, s12
vsub.f32 s12, s0, s12
vadd.f32 s14, s13, s1
vsub.f32 s13, s13, s1
vadd.f32 s0, s8, s15
vadd.f32 s1, s9, s14
vadd.f32 s2, s10, s13 @
vsub.f32 s4, s8, s15
vsub.f32 s3, s11, s12 @
vstr s0, [r3, #0]
vstr s1, [r3, #4]
add r3, r3, #8
vsub.f32 s5, s9, s14
vsub.f32 s6, s10, s13 @
vadd.f32 s7, s11, s12 @
vstr s2, [r4, #0]
vstr s3, [r4, #4]
add r4, r4, #8
vstr s4, [r5, #0]
vstr s5, [r5, #4]
add r5, r5, #8
vstr s6, [r6, #0]
vstr s7, [r6, #4]
add r6, r6, #8
bne _vfp_x4_loop
bx lr
.align 4
#ifdef __APPLE__
.globl _vfp_x8
_vfp_x8:
#else
.globl vfp_x8
vfp_x8:
#endif
mov r11, #0
add r3, r0, #0 @ data0
add r5, r0, r1, lsl #1 @ data2
add r4, r0, r1 @ data1
add r7, r5, r1, lsl #1 @ data4
add r6, r5, r1 @ data3
add r9, r7, r1, lsl #1 @ data6
add r8, r7, r1 @ data5
add r10, r9, r1 @ data7
add r12, r2, #0 @ LUT
sub r11, r11, r1, lsr #3
_vfp_x8_loop:
vldr s10, [r3, #0] @ x0-re
vldr s8, [r3, #4] @ x0-im
vldr s2, [r4, #0] @ x1-re
vldr s0, [r4, #4] @ x1-im
vldr s6, [r5, #0] @ x2-re
vldr s4, [r5, #4] @ x2-im
vldr s13, [r6, #0] @ x3-re
vldr s15, [r6, #4] @ x3-im
vldr s7, [r12]
vldr s11, [r12, #4]
vldr s5, [r7, #0] @ x4-re
vldr s1, [r7, #4] @ x4-im
vldr s28, [r9, #0] @ x6-re
vldr s18, [r9, #4] @ x6-im
adds r11, r11, #1
vmul.f32 s14, s15, s7
vldr s24, [r12, #12]
vmul.f32 s12, s13, s11
vmul.f32 s26, s13, s7
vldr s13, [r12, #8]
vmul.f32 s3, s4, s11
vmul.f32 s15, s15, s11
vmul.f32 s16, s4, s7
vmul.f32 s9, s6, s7
vmul.f32 s11, s6, s11
vmul.f32 s7, s18, s24
vmul.f32 s20, s1, s24
vmul.f32 s30, s5, s13
vadd.f32 s4, s26, s15
vsub.f32 s12, s14, s12
vsub.f32 s6, s9, s3
vadd.f32 s14, s16, s11
vmul.f32 s22, s28, s13
vmul.f32 s26, s28, s24
vmul.f32 s18, s18, s13
vmul.f32 s5, s5, s24
vmul.f32 s1, s1, s13
vsub.f32 s9, s30, s20
vadd.f32 s16, s14, s12
vadd.f32 s3, s22, s7
vadd.f32 s15, s6, s4
vsub.f32 s11, s18, s26
vadd.f32 s18, s1, s5
vadd.f32 s13, s8, s16
vadd.f32 s1, s9, s3
vadd.f32 s7, s10, s15
vsub.f32 s15, s10, s15
vsub.f32 s10, s9, s3
vadd.f32 s5, s18, s11
vsub.f32 s11, s18, s11
vsub.f32 s8, s8, s16
vadd.f32 s20, s7, s1
vsub.f32 s7, s7, s1
vadd.f32 s18, s13, s5
vadd.f32 s16, s15, s11 @
vsub.f32 s9, s8, s10 @
vsub.f32 s3, s13, s5
vsub.f32 s1, s15, s11 @
vstr s20, [r3]
vadd.f32 s8, s8, s10 @
vstr s18, [r3, #4]
add r3, r3, #8
vstr s16, [r5]
vstr s9, [r5, #4]
add r5, r5, #8
vstr s7, [r7]
vstr s3, [r7, #4]
add r7, r7, #8
vstr s1, [r9]
vstr s8, [r9, #4]
add r9, r9, #8
vldr s10, [r8, #0] @ x5-re
vldr s8, [r8, #4] @ x5-im
vldr s5, [r10, #0] @ x7-re
vldr s11, [r10, #4] @ x7-im
vldr s1, [r12, #16]
vldr s15, [r12, #20]
add r12, r12, #24
vmul.f32 s9, s5, s1
vmul.f32 s3, s11, s15
vmul.f32 s13, s10, s1
vmul.f32 s7, s8, s15
vmul.f32 s5, s5, s15
vmul.f32 s11, s11, s1
vmul.f32 s10, s10, s15
vmul.f32 s15, s8, s1
vsub.f32 s1, s14, s12
vadd.f32 s8, s9, s3
vsub.f32 s3, s6, s4
vsub.f32 s12, s13, s7
vsub.f32 s5, s11, s5
vadd.f32 s7, s15, s10
vadd.f32 s4, s2, s1 @
vsub.f32 s2, s2, s1 @
vsub.f32 s6, s0, s3 @
vadd.f32 s10, s12, s8
vsub.f32 s9, s12, s8
vadd.f32 s0, s0, s3 @
vsub.f32 s1, s7, s5
vadd.f32 s14, s7, s5
vadd.f32 s7, s4, s10
vsub.f32 s8, s4, s10
vsub.f32 s12, s0, s9 @
vadd.f32 s3, s2, s1 @
vadd.f32 s5, s6, s14
vsub.f32 s4, s6, s14
vsub.f32 s2, s2, s1 @
vadd.f32 s0, s0, s9 @
vstr s7, [r4]
vstr s5, [r4, #4]
add r4, r4, #8
vstr s3, [r6]
vstr s12, [r6, #4]
add r6, r6, #8
vstr s8, [r8]
vstr s4, [r8, #4]
add r8, r8, #8
vstr s2, [r10]
vstr s0, [r10, #4]
add r10, r10, #8
bne _vfp_x8_loop
bx lr
.align 4
#ifdef __APPLE__
.globl _vfp_end
_vfp_end:
#else
.globl vfp_end
vfp_end:
#endif
bx lr