Windows implementation WIP
This commit is contained in:
34
3rdparty/ffts/ffts-master/src/Makefile.am
vendored
Normal file
34
3rdparty/ffts/ffts-master/src/Makefile.am
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
|
||||
# libffts is built as a single libtool library.
lib_LTLIBRARIES = libffts.la

# C sources compiled in every configuration.
libffts_la_SOURCES = \
	ffts.c \
	ffts_small.c \
	ffts_nd.c \
	ffts_real.c \
	ffts_real_nd.c \
	patterns.c

# Private headers; listed so that they are shipped in the distribution.
libffts_la_SOURCES += \
	codegen.h \
	codegen_arm.h \
	codegen_sse.h \
	ffts.h \
	ffts_nd.h \
	ffts_real.h \
	ffts_real_nd.h \
	ffts_small.h \
	ffts_static.h \
	macros-alpha.h \
	macros-altivec.h \
	macros-neon.h \
	macros-sse.h \
	macros.h \
	neon.h \
	neon_float.h \
	patterns.h \
	types.h \
	vfp.h

# DYNAMIC_DISABLED selects ffts_static.c; otherwise codegen.c is built.
if DYNAMIC_DISABLED
libffts_la_SOURCES += ffts_static.c
else
libffts_la_SOURCES += codegen.c
endif

# The public header is installed into its own ffts/ include directory.
libffts_includedir = $(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h

# At most one assembly backend is added, tried in the order VFP, NEON, SSE.
# NOTE(review): VFP wins over NEON here, while codegen.c tests HAVE_NEON
# before HAVE_VFP - confirm that configure never enables both.
if HAVE_VFP
libffts_la_SOURCES += vfp.s
else
if HAVE_NEON
libffts_la_SOURCES += neon.s
# The NEON static kernels are only added when DYNAMIC_DISABLED is set.
if DYNAMIC_DISABLED
libffts_la_SOURCES += neon_static_f.s neon_static_i.s
endif
else
if HAVE_SSE
libffts_la_SOURCES += sse.s
endif
endif
endif
|
||||
730
3rdparty/ffts/ffts-master/src/Makefile.in
vendored
Normal file
730
3rdparty/ffts/ffts-master/src/Makefile.in
vendored
Normal file
@@ -0,0 +1,730 @@
|
||||
# Makefile.in generated by automake 1.14 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
|
||||
VPATH = @srcdir@
|
||||
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
|
||||
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
|
||||
@HAVE_VFP_TRUE@am__append_3 = vfp.s
|
||||
@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon.s
|
||||
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon_static_f.s neon_static_i.s
|
||||
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s
|
||||
subdir = src
|
||||
DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
|
||||
$(top_srcdir)/depcomp $(libffts_include_HEADERS)
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
|
||||
$(top_srcdir)/m4/ax_check_java_home.m4 \
|
||||
$(top_srcdir)/m4/ax_java_options.m4 \
|
||||
$(top_srcdir)/m4/ax_jni_include_dir.m4 \
|
||||
$(top_srcdir)/m4/ax_prog_jar.m4 \
|
||||
$(top_srcdir)/m4/ax_prog_javac.m4 \
|
||||
$(top_srcdir)/m4/ax_prog_javac_works.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
|
||||
am__vpath_adj = case $$p in \
|
||||
$(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
|
||||
*) f=$$p;; \
|
||||
esac;
|
||||
am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
|
||||
am__install_max = 40
|
||||
am__nobase_strip_setup = \
|
||||
srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
|
||||
am__nobase_strip = \
|
||||
for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
|
||||
am__nobase_list = $(am__nobase_strip_setup); \
|
||||
for p in $$list; do echo "$$p $$p"; done | \
|
||||
sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
|
||||
$(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
|
||||
if (++n[$$2] == $(am__install_max)) \
|
||||
{ print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
|
||||
END { for (dir in files) print dir, files[dir] }'
|
||||
am__base_list = \
|
||||
sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
|
||||
sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
|
||||
am__uninstall_files_from_dir = { \
|
||||
test -z "$$files" \
|
||||
|| { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
|
||||
|| { echo " ( cd '$$dir' && rm -f" $$files ")"; \
|
||||
$(am__cd) "$$dir" && rm -f $$files; }; \
|
||||
}
|
||||
am__installdirs = "$(DESTDIR)$(libdir)" \
|
||||
"$(DESTDIR)$(libffts_includedir)"
|
||||
LTLIBRARIES = $(lib_LTLIBRARIES)
|
||||
libffts_la_LIBADD =
|
||||
am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \
|
||||
ffts_real.c ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
|
||||
codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
|
||||
ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
|
||||
macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
|
||||
patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s neon.s \
|
||||
neon_static_f.s neon_static_i.s sse.s
|
||||
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
|
||||
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
|
||||
@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
|
||||
@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon.lo
|
||||
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon_static_f.lo \
|
||||
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@ neon_static_i.lo
|
||||
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 = \
|
||||
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@ sse.lo
|
||||
am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
|
||||
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
|
||||
$(am__objects_3) $(am__objects_4) $(am__objects_5) \
|
||||
$(am__objects_6)
|
||||
libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__depfiles_maybe = depfiles
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
|
||||
LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \
|
||||
$(CCASFLAGS)
|
||||
AM_V_CCAS = $(am__v_CCAS_@AM_V@)
|
||||
am__v_CCAS_ = $(am__v_CCAS_@AM_DEFAULT_V@)
|
||||
am__v_CCAS_0 = @echo " CCAS " $@;
|
||||
am__v_CCAS_1 =
|
||||
SOURCES = $(libffts_la_SOURCES)
|
||||
DIST_SOURCES = $(am__libffts_la_SOURCES_DIST)
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
HEADERS = $(libffts_include_HEADERS)
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
ACLOCAL = @ACLOCAL@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCAS = @CCAS@
|
||||
CCASDEPMODE = @CCASDEPMODE@
|
||||
CCASFLAGS = @CCASFLAGS@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CXX = @CXX@
|
||||
CXXCPP = @CXXCPP@
|
||||
CXXDEPMODE = @CXXDEPMODE@
|
||||
CXXFLAGS = @CXXFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
FGREP = @FGREP@
|
||||
GREP = @GREP@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
JAR = @JAR@
|
||||
JAVA = @JAVA@
|
||||
JAVAC = @JAVAC@
|
||||
JAVACFLAGS = @JAVACFLAGS@
|
||||
JAVAFLAGS = @JAVAFLAGS@
|
||||
JAVAPREFIX = @JAVAPREFIX@
|
||||
JAVA_PATH_NAME = @JAVA_PATH_NAME@
|
||||
JNI_CPPFLAGS = @JNI_CPPFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHELL = @SHELL@
|
||||
STRIP = @STRIP@
|
||||
VERSION = @VERSION@
|
||||
_ACJNI_JAVAC = @_ACJNI_JAVAC@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_CXX = @ac_ct_CXX@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
lib_LTLIBRARIES = libffts.la
|
||||
libffts_la_SOURCES = ffts.c ffts_small.c ffts_nd.c ffts_real.c \
|
||||
ffts_real_nd.c patterns.c codegen.h codegen_arm.h \
|
||||
codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
|
||||
ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
|
||||
macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
|
||||
patterns.h types.h vfp.h $(am__append_1) $(am__append_2) \
|
||||
$(am__append_3) $(am__append_4) $(am__append_5) \
|
||||
$(am__append_6)
|
||||
libffts_includedir = $(includedir)/ffts
|
||||
libffts_include_HEADERS = ../include/ffts.h
|
||||
all: all-am
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj .s
|
||||
$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu src/Makefile
|
||||
.PRECIOUS: Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
|
||||
esac;
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
install-libLTLIBRARIES: $(lib_LTLIBRARIES)
|
||||
@$(NORMAL_INSTALL)
|
||||
@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
|
||||
list2=; for p in $$list; do \
|
||||
if test -f $$p; then \
|
||||
list2="$$list2 $$p"; \
|
||||
else :; fi; \
|
||||
done; \
|
||||
test -z "$$list2" || { \
|
||||
echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
|
||||
$(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
|
||||
echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
|
||||
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
|
||||
}
|
||||
|
||||
uninstall-libLTLIBRARIES:
|
||||
@$(NORMAL_UNINSTALL)
|
||||
@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
|
||||
for p in $$list; do \
|
||||
$(am__strip_dir) \
|
||||
echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
|
||||
$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
|
||||
done
|
||||
|
||||
clean-libLTLIBRARIES:
|
||||
-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
|
||||
@list='$(lib_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_nd.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_real_nd.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_small.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ffts_static.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@
|
||||
|
||||
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
.s.o:
|
||||
$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $<
|
||||
|
||||
.s.obj:
|
||||
$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
.s.lo:
|
||||
$(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $<
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
install-libffts_includeHEADERS: $(libffts_include_HEADERS)
|
||||
@$(NORMAL_INSTALL)
|
||||
@list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
|
||||
if test -n "$$list"; then \
|
||||
echo " $(MKDIR_P) '$(DESTDIR)$(libffts_includedir)'"; \
|
||||
$(MKDIR_P) "$(DESTDIR)$(libffts_includedir)" || exit 1; \
|
||||
fi; \
|
||||
for p in $$list; do \
|
||||
if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
|
||||
echo "$$d$$p"; \
|
||||
done | $(am__base_list) | \
|
||||
while read files; do \
|
||||
echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libffts_includedir)'"; \
|
||||
$(INSTALL_HEADER) $$files "$(DESTDIR)$(libffts_includedir)" || exit $$?; \
|
||||
done
|
||||
|
||||
uninstall-libffts_includeHEADERS:
|
||||
@$(NORMAL_UNINSTALL)
|
||||
@list='$(libffts_include_HEADERS)'; test -n "$(libffts_includedir)" || list=; \
|
||||
files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
|
||||
dir='$(DESTDIR)$(libffts_includedir)'; $(am__uninstall_files_from_dir)
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-am
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-am
|
||||
|
||||
CTAGS: ctags
|
||||
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-am
|
||||
|
||||
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: check-am
|
||||
all-am: Makefile $(LTLIBRARIES) $(HEADERS)
|
||||
installdirs:
|
||||
for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libffts_includedir)"; do \
|
||||
test -z "$$dir" || $(MKDIR_P) "$$dir"; \
|
||||
done
|
||||
install: install-am
|
||||
install-exec: install-exec-am
|
||||
install-data: install-data-am
|
||||
uninstall: uninstall-am
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-am
|
||||
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
clean: clean-am
|
||||
|
||||
clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-am
|
||||
-rm -rf ./$(DEPDIR)
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-am
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-am
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-am
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am: install-libffts_includeHEADERS
|
||||
|
||||
install-dvi: install-dvi-am
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am: install-libLTLIBRARIES
|
||||
|
||||
install-html: install-html-am
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-am
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-am
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-am
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-am
|
||||
-rm -rf ./$(DEPDIR)
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic
|
||||
|
||||
mostlyclean: mostlyclean-am
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-am
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-am
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am: uninstall-libLTLIBRARIES \
|
||||
uninstall-libffts_includeHEADERS
|
||||
|
||||
.MAKE: install-am install-strip
|
||||
|
||||
.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
|
||||
clean-libLTLIBRARIES clean-libtool cscopelist-am ctags \
|
||||
ctags-am distclean distclean-compile distclean-generic \
|
||||
distclean-libtool distclean-tags distdir dvi dvi-am html \
|
||||
html-am info info-am install install-am install-data \
|
||||
install-data-am install-dvi install-dvi-am install-exec \
|
||||
install-exec-am install-html install-html-am install-info \
|
||||
install-info-am install-libLTLIBRARIES \
|
||||
install-libffts_includeHEADERS install-man install-pdf \
|
||||
install-pdf-am install-ps install-ps-am install-strip \
|
||||
installcheck installcheck-am installdirs maintainer-clean \
|
||||
maintainer-clean-generic mostlyclean mostlyclean-compile \
|
||||
mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
|
||||
tags tags-am uninstall uninstall-am uninstall-libLTLIBRARIES \
|
||||
uninstall-libffts_includeHEADERS
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
732
3rdparty/ffts/ffts-master/src/codegen.c
vendored
Normal file
732
3rdparty/ffts/ffts-master/src/codegen.c
vendored
Normal file
@@ -0,0 +1,732 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "codegen.h"
|
||||
#include "macros.h"
|
||||
#include "ffts.h"
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <libkern/OSCacheControl.h>
|
||||
#endif
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
#include "codegen_arm.h"
|
||||
#include "neon.h"
|
||||
#elif HAVE_VFP
|
||||
#include "codegen_arm.h"
|
||||
#include "vfp.h"
|
||||
#else
|
||||
#include "codegen_sse.h"
|
||||
#include "macros-sse.h"
|
||||
#endif
|
||||
|
||||
#ifdef __ANDROID__
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
/*
 * Count the internal (non-leaf) nodes of the recursive decomposition of a
 * size-N transform.  A node of size N splits into five children of sizes
 * N/4, N/8, N/8, N/4 and N/4; recursion stops once N <= leafN.
 *
 * N      - size of the current sub-transform
 * leafN  - sizes at or below this are leaves and contribute nothing
 * offset - position of the sub-transform; only forwarded to the children,
 *          it has no influence on the returned count
 *
 * Returns the number of nodes with N > leafN in the tree rooted here.
 */
int tree_count(int N, int leafN, int offset) {
	int nodes;

	if(N <= leafN)
		return 0;

	nodes = 1; /* this node itself */
	nodes += tree_count(N/4, leafN, offset);
	nodes += tree_count(N/8, leafN, offset + N/4);
	nodes += tree_count(N/8, leafN, offset + N/4 + N/8);
	nodes += tree_count(N/4, leafN, offset + N/2);
	nodes += tree_count(N/4, leafN, offset + 3*N/4);

	return nodes;
}
|
||||
|
||||
/*
 * Walk the same decomposition as tree_count() and append one record per
 * internal node to the array that *p points into.  Children are visited
 * before their parent, so records appear in post-order.
 *
 * Each record is two size_t values: { N, offset*2 }.  *p is advanced by two
 * for every record written; the caller must provide enough room.
 */
void elaborate_tree(size_t **p, int N, int leafN, int offset) {
	if(N > leafN) {
		const int quarter = N/4;
		const int eighth  = N/8;
		size_t *slot;

		elaborate_tree(p, quarter, leafN, offset);
		elaborate_tree(p, eighth,  leafN, offset + quarter);
		elaborate_tree(p, eighth,  leafN, offset + quarter + eighth);
		elaborate_tree(p, quarter, leafN, offset + N/2);
		elaborate_tree(p, quarter, leafN, offset + 3*N/4);

		/* emit this node's record after all of its children */
		slot = *p;
		slot[0] = N;
		slot[1] = offset*2;
		*p = slot + 2;
	}
}
|
||||
|
||||
|
||||
|
||||
|
||||
uint32_t LUT_offset(size_t N, size_t leafN) {
|
||||
int i;
|
||||
size_t p_lut_size = 0;
|
||||
size_t lut_size = 0;
|
||||
int hardcoded = 0;
|
||||
size_t n_luts = __builtin_ctzl(N/leafN);
|
||||
int n = leafN*2;
|
||||
//if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
|
||||
|
||||
for(i=0;i<n_luts-1;i++) {
|
||||
p_lut_size = lut_size;
|
||||
if(!i || hardcoded) {
|
||||
#ifdef __arm__
|
||||
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
|
||||
else lut_size += n/4 * sizeof(cdata_t);
|
||||
#else
|
||||
lut_size += n/4 * 2 * sizeof(cdata_t);
|
||||
#endif
|
||||
// n *= 2;
|
||||
} else {
|
||||
#ifdef __arm__
|
||||
lut_size += n/8 * 3 * sizeof(cdata_t);
|
||||
#else
|
||||
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
|
||||
#endif
|
||||
}
|
||||
n *= 2;
|
||||
}
|
||||
return lut_size;
|
||||
}
|
||||
|
||||
/*
 * Unit in which generated code is addressed: one byte when not building for
 * ARM, one 32-bit word when __arm__ is defined.
 */
#ifndef __arm__
typedef uint8_t insns_t;
#else
typedef uint32_t insns_t;
#endif
|
||||
|
||||
/* Append one byte to the code stream addressed by the local `uint8_t **p`. */
#define P(x) (*(*p)++ = x)

/*
 * Emit exactly `count` bytes of x86 NOP padding at *p and advance *p.
 *
 * Padding is produced from multi-byte NOP encodings of 1 to 9 bytes: as many
 * 9-byte NOPs as needed while more than 9 bytes remain, followed by a single
 * NOP of the remaining length.  count == 0 emits nothing.
 */
void insert_nops(uint8_t **p, uint32_t count) {
	/* nop_seq[k] holds the k-byte NOP encoding in its first k bytes */
	static const uint8_t nop_seq[10][9] = {
		{ 0x00 },
		{ 0x90 },
		{ 0x66, 0x90 },
		{ 0x0F, 0x1F, 0x00 },
		{ 0x0F, 0x1F, 0x40, 0x00 },
		{ 0x0F, 0x1F, 0x44, 0x00, 0x00 },
		{ 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 },
		{ 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 },
		{ 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
		{ 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }
	};
	uint32_t k;

	/* anything longer than 9 bytes is covered by repeated 9-byte NOPs */
	for(; count > 9; count -= 9) {
		for(k = 0; k < 9; k++) P(nop_seq[9][k]);
	}

	/* the remaining 0..9 bytes are a single NOP of that length */
	for(k = 0; k < count; k++) P(nop_seq[count][k]);
}
|
||||
|
||||
|
||||
/*
 * Pad the code stream with NOPs so that (*p + offset) lands on a 16-byte
 * boundary.  Only active when compiling for x86-64; a no-op elsewhere.
 *
 * p      - in/out cursor into the code buffer
 * offset - number of bytes that will follow *p before the location that has
 *          to be 16-byte aligned (only the low 4 bits matter)
 */
void align_mem16(uint8_t **p, uint32_t offset) {
#ifdef __x86_64__
	/* uintptr_t rather than uint32_t: the pointer is 64 bits wide here, and
	 * casting it to a narrower integer type is a truncating conversion that
	 * compilers diagnose.  Only the low four bits are used. */
	int r = (int)(16 - (offset & 0xf)) - (int)((uintptr_t)(*p) & 0xf);
	r = (16 + r) & 0xf;
	insert_nops(p, r);
#endif
}
|
||||
|
||||
/* Some libcs only expose MAP_ANON, or hide MAP_ANONYMOUS in strict ISO mode.
 * Provide a fallback without clobbering a definition from <sys/mman.h>. */
#ifndef MAP_ANONYMOUS
#ifdef MAP_ANON
#define MAP_ANONYMOUS MAP_ANON
#else
#define MAP_ANONYMOUS 0x20
#endif
#endif

/*
 * Generate the machine code for a size-N transform and store it in the plan.
 *
 * p     - plan to fill in; reads p->i0, p->i1 (and p->i2 on VFP), p->ws_is
 *         and the addresses of several plan fields; writes p->constants
 *         (x86-64 only), p->transform_size, p->transform_base, p->transform
 * N     - transform size
 * leafN - leaf size passed to tree_count()/elaborate_tree()
 * sign  - selects which instruction patches / constant table are applied
 *         (sign < 0 versus otherwise)
 *
 * The code buffer is obtained with mmap(), filled by copying pre-assembled
 * fragments and emitting glue instructions, then switched to read+execute
 * with mprotect().  Allocation and mprotect failures print a message and
 * call exit(1).
 */
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
	int count = tree_count(N, leafN, 0) + 1;
	size_t *ps = malloc((size_t)count * 2 * sizeof(size_t));
	size_t *pps = ps;

	if(!ps) {
		fprintf(stderr, "NOMEM\n");
		exit(1);
	}

#ifdef __x86_64__
	if(sign < 0) p->constants = sse_constants;
	else         p->constants = sse_constants_inv;
#endif

	/* list of { size, offset } records, terminated by a { 0, 0 } record */
	elaborate_tree(&pps, N, leafN, 0);
	pps[0] = 0;
	pps[1] = 0;

	pps = ps;

#ifdef __arm__
	if(N < 8192) p->transform_size = 8192;
	else p->transform_size = N;
#else
	if(N < 2048) p->transform_size = 16384;
	else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
#endif

	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);

	/* mmap() reports failure with MAP_FAILED, not NULL */
	if(p->transform_base == MAP_FAILED || !p->transform_base) {
		fprintf(stderr, "NOMEM\n");
		exit(1);
	}

	insns_t *func = p->transform_base;
	insns_t *fp = func;

	/* ---- x8 sub-transform fragment ---- */
	insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_x8, neon_x8_t - neon_x8);
	/*
	 * Changes adds to subtracts and vice versa to allow the computation
	 * of both the IFFT and FFT
	 */
	if(sign < 0) {
		fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
		fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
		fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
	}
	fp += (neon_x8_t - neon_x8) / 4;
#else
	memcpy(fp, vfp_x8, vfp_end - vfp_x8);
	if(sign > 0) {
		fp[65] ^= 0x00000040;
		fp[66] ^= 0x00000040;
		fp[68] ^= 0x00000040;
		fp[70] ^= 0x00000040;
		fp[103] ^= 0x00000040;
		fp[104] ^= 0x00000040;
		fp[105] ^= 0x00000040;
		fp[108] ^= 0x00000040;
		fp[113] ^= 0x00000040;
		fp[114] ^= 0x00000040;
		fp[117] ^= 0x00000040;
		fp[118] ^= 0x00000040;
	}
	fp += (vfp_end - vfp_x8) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_8_addr = fp;
	align_mem16(&fp, 5);
	memcpy(fp, x8_soft, x8_hard - x8_soft);
	fp += (x8_hard - x8_soft);
#endif

	/* ---- x4 sub-transform fragment ---- */
	insns_t *x_4_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_x4, neon_x8 - neon_x4);
	if(sign < 0) {
		fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
	}
	fp += (neon_x8 - neon_x4) / 4;
#else
	memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
	if(sign > 0) {
		fp[36] ^= 0x00000040;
		fp[38] ^= 0x00000040;
		fp[43] ^= 0x00000040;
		fp[44] ^= 0x00000040;
	}
	fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_4_addr = fp;
	memcpy(fp, x4, x8_soft - x4);
	fp += (x8_soft - x4);
#endif

	/* ---- entry point of the generated transform ---- */
	insns_t *start = fp;

#ifdef __arm__
	*fp = PUSH_LR(); fp++;
	*fp = 0xed2d8b10; fp++;   /* presumably vpush {d8-d15} -- TODO confirm */

	/* ADDI with an immediate of 0 acts as a register move */
	ADDI(&fp, 3, 1, 0);
	ADDI(&fp, 7, 1, N);
	ADDI(&fp, 5, 1, 2*N);
	ADDI(&fp, 10, 7, 2*N);
	ADDI(&fp, 4, 5, 2*N);
	ADDI(&fp, 8, 10, 2*N);
	ADDI(&fp, 6, 4, 2*N);
	ADDI(&fp, 9, 8, 2*N);

	*fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
	ADDI(&fp, 1, 0, 0);   // copy r0 into r1

	ADDI(&fp, 0, 2, 0);   // mov out into r0

	*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
	MOVI(&fp, 11, p->i0);
#else
	align_mem16(&fp, 0);
	start = fp;

	/* 4c 8b 07: presumably `mov r8, [rdi]` -- TODO confirm */
	*fp++ = 0x4c;
	*fp++ = 0x8b;
	*fp++ = 0x07;
	uint32_t lp_cnt = p->i0 * 4;
	MOVI(&fp, RCX, lp_cnt);
#endif

	/* ---- leaf passes ---- */
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_ee, neon_oo - neon_ee);
	if(sign < 0) {
		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
	}
	fp += (neon_oo - neon_ee) / 4;
#else
	memcpy(fp, vfp_e, vfp_o - vfp_e);
	if(sign > 0) {
		fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
		fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
		fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
	}
	fp += (vfp_o - vfp_e) / 4;
#endif
#else
	PUSH(&fp, RBP);
	PUSH(&fp, RBX);
	PUSH(&fp, R10);
	PUSH(&fp, R11);
	PUSH(&fp, R12);
	PUSH(&fp, R13);
	PUSH(&fp, R14);
	PUSH(&fp, R15);

	int i;
	memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
	fp += (leaf_ee - leaf_ee_init);

	align_mem16(&fp, 9);
	memcpy(fp, leaf_ee, leaf_oo - leaf_ee);

	/* values patched (times 4) into the copied leaf fragments at the
	 * positions listed in the sse_leaf_*_offsets tables */
	uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
	uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
	uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};

	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);

	fp += (leaf_oo - leaf_ee);

	if(__builtin_ctzl(N) & 1){

		if(p->i1) {
			lp_cnt += p->i1 * 4;
			MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
			memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
			for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
			fp += (leaf_eo - leaf_oo);
		}

		memcpy(fp, leaf_oe, leaf_end - leaf_oe);
		lp_cnt += 4;
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4);
		fp += (leaf_end - leaf_oe);

	}else{

		memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
		lp_cnt += 4;
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4);
		fp += (leaf_oe - leaf_eo);

		if(p->i1) {
			lp_cnt += p->i1 * 4;
			MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
			memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
			for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
			fp += (leaf_eo - leaf_oo);
		}

	}
	if(p->i1) {
		lp_cnt += p->i1 * 4;
		MOVI(&fp, RCX, lp_cnt);
		align_mem16(&fp, 9);
		memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4);
		fp += (leaf_oo - leaf_ee);
	}

	memcpy(fp, x_init, x4 - x_init);
	fp += (x4 - x_init);

	/* ---- one call per record produced by elaborate_tree() ---- */
	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	while(pps[0]) {

		if(!pN) {
			MOVI(&fp, RCX, pps[0] / 4);
		}else{
			if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
			if(pps[0] > leafN && pps[0] - pN) {

				int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
				/* c1 e1 ib / c1 e9 ib: presumably shl/shr ecx, imm8 -- TODO confirm */
				*fp++ = 0xc1;

				if(diff > 0) {
					*fp++ = 0xe1;
					*fp++ = (diff & 0xff);
				}else{
					*fp++ = 0xe9;
					*fp++ = ((-diff) & 0xff);
				}
			}
		}

		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
			ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);

		if(pps[0] == 2*leafN) {
			CALL(&fp, x_4_addr);
		}else{
			CALL(&fp, x_8_addr);
		}

		pAddr = pps[1] * 4;
		if(pps[0] > leafN)
			pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;
		pps += 2;
	}
#endif

#ifdef __arm__
#ifdef HAVE_NEON
	if(__builtin_ctzl(N) & 1){
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);

		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}

		*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++;

		memcpy(fp, neon_oe, neon_end - neon_oe);
		if(sign < 0) {
			fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
			fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
			fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
		}
		fp += (neon_end - neon_oe) / 4;

	}else{

		*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++;

		memcpy(fp, neon_eo, neon_oe - neon_eo);
		if(sign < 0) {
			fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
			fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
			fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
		}
		fp += (neon_oe - neon_eo) / 4;

		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);

		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}

	}

	if(p->i1) {
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);

		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
		MOVI(&fp, 11, p->i1);
		memcpy(fp, neon_ee, neon_oo - neon_ee);
		if(sign < 0) {
			fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
			fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
			fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
		}
		fp += (neon_oo - neon_ee) / 4;

	}
#else
	ADDI(&fp, 2, 7, 0);
	ADDI(&fp, 7, 9, 0);
	ADDI(&fp, 9, 2, 0);

	ADDI(&fp, 2, 8, 0);
	ADDI(&fp, 8, 10, 0);
	ADDI(&fp, 10, 2, 0);

	MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
	memcpy(fp, vfp_o, vfp_x4 - vfp_o);
	if(sign > 0) {
		fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
		fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
	}
	fp += (vfp_x4 - vfp_o) / 4;

	ADDI(&fp, 2, 3, 0);
	ADDI(&fp, 3, 7, 0);
	ADDI(&fp, 7, 2, 0);

	ADDI(&fp, 2, 4, 0);
	ADDI(&fp, 4, 8, 0);
	ADDI(&fp, 8, 2, 0);

	ADDI(&fp, 2, 5, 0);
	ADDI(&fp, 5, 9, 0);
	ADDI(&fp, 9, 2, 0);

	ADDI(&fp, 2, 6, 0);
	ADDI(&fp, 6, 10, 0);
	ADDI(&fp, 10, 2, 0);

	ADDI(&fp, 2, 9, 0);
	ADDI(&fp, 9, 10, 0);
	ADDI(&fp, 10, 2, 0);

	*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
	MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
	memcpy(fp, vfp_e, vfp_o - vfp_e);
	if(sign > 0) {
		fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
		fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
		fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
	}
	fp += (vfp_o - vfp_e) / 4;

#endif
	*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load ws into r2
	MOVI(&fp, 1, 0);

	// args: r0 - out
	//       r1 - N
	//       r2 - ws

	/* ---- one call per record produced by elaborate_tree() ---- */
	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	while(pps[0]) {

		if(!pN) {
			MOVI(&fp, 1, pps[0]);
		}else{
			if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
			if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
		}

		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
			ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);

		if(pps[0] == 2*leafN) {
			*fp = BL(fp+2, x_4_addr); fp++;
		}else if(!pps[2]){
			/* last record: on NEON the x8_t fragment is inlined instead of called */
#ifdef HAVE_NEON
			memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
			if(sign < 0) {
				fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
				fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
				fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
			}
			fp += (neon_ee - neon_x8_t) / 4;
#else
			*fp = BL(fp+2, x_8_addr); fp++;
#endif
		}else{
			*fp = BL(fp+2, x_8_addr); fp++;
		}

		pAddr = pps[1] * 4;
		pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;
		pps += 2;
	}

	*fp++ = 0xecbd8b10;   /* presumably vpop {d8-d15} -- TODO confirm */
	*fp++ = POP_LR();
#else
	POP(&fp, R15);
	POP(&fp, R14);
	POP(&fp, R13);
	POP(&fp, R12);
	POP(&fp, R11);
	POP(&fp, R10);
	POP(&fp, RBX);
	POP(&fp, RBP);
	RET(&fp);
#endif

	free(ps);

	/* the buffer was mapped read+write; make it read+execute before use */
	if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
		perror("Couldn't mprotect");
		exit(1);
	}
#if defined(__APPLE__)
	sys_icache_invalidate(func, p->transform_size);
#elif defined(__ANDROID__)
	cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif defined(__linux__)
#ifdef __GNUC__
	/* __clear_cache takes pointers, not integers */
	__clear_cache((char *)func, (char *)func + p->transform_size);
#endif
#endif

	p->transform = (void *) (start);
}
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
50
3rdparty/ffts/ffts-master/src/codegen.h
vendored
Normal file
50
3rdparty/ffts/ffts-master/src/codegen.h
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __CODEGEN_H__
|
||||
#define __CODEGEN_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <sys/mman.h>
|
||||
#include <string.h>
|
||||
#include <limits.h> /* for PAGESIZE */
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign);
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
102
3rdparty/ffts/ffts-master/src/codegen_arm.h
vendored
Normal file
102
3rdparty/ffts/ffts-master/src/codegen_arm.h
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __CODEGEN_ARM_H__
|
||||
#define __CODEGEN_ARM_H__
|
||||
|
||||
|
||||
|
||||
/*
 * Build a 32-bit instruction word 0xEBxxxxxx whose low 24 bits hold the
 * distance from `pos` to `target`, measured in 4-byte words.
 *
 * pos    - reference address the distance is measured from
 * target - address to branch to
 */
uint32_t BL(void *pos, void *target) {
	const ptrdiff_t words = ((char *)target - (char *)pos) / 4;

	return 0xeb000000 | (words & 0xffffff);
}
|
||||
|
||||
/*
 * Build the instruction word 0xE12FFF10 with `r` OR-ed into it
 * (presumably ARM `bx r` -- TODO confirm).  `r` is not masked, so callers
 * must pass a value in 0..15.
 */
uint32_t B(uint8_t r) {
	uint32_t insn = 0xe12fff10;

	insn |= r;
	return insn;
}
|
||||
|
||||
/*
 * Build the instruction word 0xE1A00000 with the low 4 bits of `src` in
 * bits 0-3 and the low 4 bits of `dst` in bits 12-15
 * (presumably ARM `mov dst, src` -- TODO confirm).
 */
uint32_t MOV(uint8_t dst, uint8_t src) {
	const uint32_t rm = src & 0xf;
	const uint32_t rd = dst & 0xf;

	return 0xe1a00000 | rm | (rd << 12);
}
|
||||
|
||||
/*
 * Emit one or more instruction words computing dst = src + imm and advance *p.
 *
 * Each emitted word carries 8 immediate bits plus a 4-bit rotation field, so
 * the immediate is shifted right by an even amount until its low bit group
 * is in bits 0-7.  If more than 8 significant bits remain, the low 8 are
 * emitted first and the function recurses for the rest.  A negative `imm`
 * uses the 0xE24xxxxx word form with |imm|, a non-negative one 0xE28xxxxx.
 *
 * p   - in/out cursor into the code buffer
 * dst - destination register number (low 4 bits used)
 * src - source register number (low 4 bits used)
 * imm - value to add; INT32_MIN is not supported (its negation overflows)
 *
 * Fixes relative to the previous version:
 *  - imm == 0 no longer calls __builtin_ctzl(0), whose result is undefined;
 *    zero is treated as having the maximum (clamped) trailing-zero count.
 *  - the continuation for wide immediates now adds to `dst`, not `src`.
 *    Adding to `src` again discarded the first chunk whenever dst != src.
 */
void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
	const int32_t oimm = imm;
	const int negative = (imm < 0);
	uint32_t shamt;

	if(negative) imm = -imm;

	/* even shift that strips trailing zeros, capped at 23 as before */
	shamt = imm ? (uint32_t)__builtin_ctzl(imm) : 23;
	if(shamt > 23) shamt = 23;
	if(shamt & 1) shamt -= 1;
	imm >>= shamt;
	shamt = (32 - shamt)/2;   /* rotation field value */

	*(*p)++ = (negative ? 0xe2400000 : 0xe2800000)
	        | ((src & 0xf) << 16) | ((dst & 0xf) << 12)
	        | ((shamt & 0xf) << 8) | (imm & 0xff);

	if(imm > 255) {
		/* part of the immediate that the word above already applied */
		const int32_t chunk = (imm & 0xff) << (32-shamt*2);

		/* dst now holds src +/- chunk: keep accumulating into dst */
		ADDI(p, dst, dst, negative ? (oimm + chunk) : (oimm - chunk));
	}
}
|
||||
|
||||
/*
 * Build the instruction word 0xE5900000 with the low 4 bits of `dst` in
 * bits 12-15, the low 4 bits of `base` in bits 16-19 and the low 12 bits of
 * `offset` in bits 0-11 (presumably ARM `ldr dst, [base, #offset]` --
 * TODO confirm).  Offsets above 0xfff are silently truncated.
 */
uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
	uint32_t insn = 0xe5900000;

	insn |= (dst & 0xf) << 12;
	insn |= (base & 0xf) << 16;
	insn |= offset & 0xfff;
	return insn;
}
|
||||
|
||||
/*
 * Emit instruction words that load the constant `imm` into register `dst`
 * and advance *p.
 *
 * The first word (0xE3A0xxxx form) carries 8 immediate bits and a 4-bit
 * rotation field; if `imm` has more than 8 significant bits after stripping
 * trailing zeros, the remainder is added with ADDI(p, dst, dst, ...).
 *
 * p   - in/out cursor into the code buffer
 * dst - destination register number (low 4 bits used)
 * imm - constant to load
 *
 * imm == 0 is handled explicitly: __builtin_ctzl(0) has an undefined result,
 * and callers do pass 0.  Zero is treated as having the maximum (clamped)
 * trailing-zero count.
 */
void MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
	const uint32_t oimm = imm;
	uint32_t shamt;

	/* even shift that strips trailing zeros, capped at 23 as before */
	shamt = imm ? (uint32_t)__builtin_ctzl(imm) : 23;
	if(shamt > 23) shamt = 23;
	if(shamt & 1) shamt -= 1;
	imm >>= shamt;
	shamt = (32 - shamt)/2;   /* rotation field value */

	*(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;

	/* add whatever did not fit into the 8-bit immediate */
	if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
}
|
||||
|
||||
/* Instruction word emitted at the entry of the generated function
 * (presumably `push {r4-r11, lr}` -- TODO confirm). */
uint32_t PUSH_LR() {
	const uint32_t insn = 0xe92d4ff0;

	return insn;
}
|
||||
/* Instruction word emitted at the exit of the generated function
 * (presumably `pop {r4-r11, pc}` -- TODO confirm). */
uint32_t POP_LR() {
	const uint32_t insn = 0xe8bd8ff0;

	return insn;
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
196
3rdparty/ffts/ffts-master/src/codegen_sse.h
vendored
Normal file
196
3rdparty/ffts/ffts-master/src/codegen_sse.h
vendored
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __CODEGEN_SSE_H__
|
||||
#define __CODEGEN_SSE_H__
|
||||
|
||||
void neon_x4(float *, size_t, float *);
|
||||
void neon_x8(float *, size_t, float *);
|
||||
void neon_x8_t(float *, size_t, float *);
|
||||
void leaf_ee_init();
|
||||
void leaf_ee();
|
||||
void leaf_oo();
|
||||
void leaf_eo();
|
||||
void leaf_oe();
|
||||
void leaf_end();
|
||||
void x_init();
|
||||
void x4();
|
||||
void x8_soft();
|
||||
void x8_hard();
|
||||
|
||||
void sse_constants();
|
||||
void sse_constants_inv();
|
||||
|
||||
// typedef uint8_t insns_t;
|
||||
|
||||
extern const uint32_t sse_leaf_ee_offsets[8];
|
||||
extern const uint32_t sse_leaf_oo_offsets[8];
|
||||
extern const uint32_t sse_leaf_eo_offsets[8];
|
||||
extern const uint32_t sse_leaf_oe_offsets[8];
|
||||
|
||||
/* Register numbers passed as dst/base/reg arguments to the emitters below.
 * 32-bit names, in numeric order. */
#define EAX 0
#define ECX 1
#define EDX 2
#define EBX 3
#define EBP 5
#define ESI 6
#define EDI 7

/* 64-bit names; numbers 8 and above make the emitters add a prefix byte. */
#define RAX 0
#define RCX 1
#define RDX 2
#define RBX 3
#define RBP 5
#define RSI 6
#define RDI 7
#define R8  8
#define R9  9
#define R10 10
#define R11 11
#define R12 12
#define R13 13
#define R14 14
#define R15 15
|
||||
|
||||
/* Append the low 8 bits of `imm` to the code stream and advance *p by one. */
void IMM8(uint8_t **p, int32_t imm) {
	uint8_t *out = *p;

	out[0] = (imm & 0xff);
	*p = out + 1;
}
|
||||
|
||||
/* Append the low 16 bits of `imm` to the code stream, least significant
 * byte first, and advance *p by two. */
void IMM16(uint8_t **p, int32_t imm) {
	uint8_t *out = *p;

	out[0] = (imm & 0x00ff);
	out[1] = (imm & 0xff00) >> 8;
	*p = out + 2;
}
|
||||
/*
 * Append the 32 bits of `imm` to the code stream, least significant byte
 * first, and advance *p by four.
 *
 * The bytes are extracted in unsigned arithmetic: the previous
 * `(imm & (0xff << 24)) >> 24` overflowed a signed int in the shift and then
 * right-shifted a negative value.
 */
void IMM32(uint8_t **p, int32_t imm) {
	const uint32_t bits = (uint32_t)imm;
	int i;

	for(i=0;i<4;i++) {
		*(*p)++ = (uint8_t)((bits >> (i*8)) & 0xff);
	}
}
|
||||
/*
 * Store imm as four bytes starting at p, least-significant byte first.
 * Unlike IMM32 this takes the destination by value, so no cursor is moved.
 *
 * The split is done on an unsigned copy to avoid the signed overflow of
 * `0xff << 24` in the previous form.
 */
void IMM32_NI(uint8_t *p, int32_t imm) {
	const uint32_t bits = (uint32_t)imm;
	int i;

	for(i=0;i<4;i++) {
		p[i] = (uint8_t)((bits >> (i*8)) & 0xffu);
	}
}
|
||||
|
||||
/*
 * Read four bytes starting at p as a little-endian 32-bit value (the inverse
 * of IMM32_NI) and return it as a signed integer.
 *
 * Bytes are widened to uint32_t before shifting; shifting the int-promoted
 * byte left by 24 overflowed whenever the top byte was 0x80 or larger.
 */
int32_t READ_IMM32(uint8_t *p) {
	uint32_t bits = 0;
	int i;

	for(i=0;i<4;i++) {
		bits |= (uint32_t)p[i] << (i*8);
	}
	return (int32_t)bits;
}
|
||||
|
||||
/*
 * Emit "load 32-bit immediate into register dst".
 *
 * Bytes written: an optional 0x41 prefix when dst is 8 or above, the opcode
 * 0xb8 combined with the low three bits of dst, then imm as four
 * little-endian bytes.  The full 32-bit immediate form is always used,
 * regardless of how small imm is.
 */
void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
	const uint8_t opcode = 0xb8 | (dst & 0x7);

	if(dst >= 8) {
		*(*p)++ = 0x41;
	}
	*(*p)++ = opcode;

	IMM32(p, imm);
}
|
||||
|
||||
/*
 * Emit the addressing byte for a [rm + disp] operand, followed by the
 * displacement if there is one.
 *
 *   disp == 0            -> one byte, no displacement
 *   -128 <= disp <= 127  -> 0x40-form byte + 8-bit displacement
 *   anything else        -> 0x80-form byte + 32-bit displacement
 *
 * Only the low three bits of reg and rm are encoded here; callers emit any
 * prefix carrying the high bit (see LEA).
 *
 * The range test used to read `disp <= 127 || disp >= -128`, which is true
 * for every value, so the 32-bit branch was unreachable and large
 * displacements were silently truncated to their low byte.
 *
 * NOTE(review): base registers whose low three bits are 4 or 5 have special
 * meanings in the no-displacement x86 encoding; this function does not treat
 * them specially -- presumably callers avoid them, confirm.
 */
void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
	const uint8_t regrm = (rm & 7) | ((reg & 7) << 3);

	if(disp == 0) {
		*(*p)++ = regrm;
	}else if(disp <= 127 && disp >= -128) {
		*(*p)++ = 0x40 | regrm;
		IMM8(p, disp);
	}else{
		*(*p)++ = 0x80 | regrm;
		IMM32(p, disp);
	}
}
|
||||
|
||||
/*
 * Emit a 64-bit LEA: dst = address of [base + disp].
 *
 * The prefix byte starts at 0x48 and gains bit 0 when base is 8 or above and
 * bit 2 when dst is 8 or above; it is followed by opcode 0x8d and the
 * operand bytes produced by ADDRMODE.
 */
void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) {
	uint8_t prefix = 0x48;

	if(base & 0x8) prefix |= 0x01;
	if(dst & 0x8)  prefix |= 0x04;

	*(*p)++ = prefix;
	*(*p)++ = 0x8d;
	ADDRMODE(p, dst, base, disp);
}
|
||||
|
||||
/* Emit a single 0xc3 byte (near return) and advance the cursor by one. */
void RET(uint8_t **p) {
	uint8_t *cursor = *p;

	*cursor = 0xc3;
	*p = cursor + 1;
}
|
||||
|
||||
/*
 * Emit a 64-bit "add immediate to register dst".
 *
 * Layout: prefix (0x49 when dst is 8 or above, else 0x48), opcode (0x81 for
 * a 32-bit immediate, 0x83 for an 8-bit one), a register byte 0xc0 | low
 * three bits of dst, then the immediate.
 *
 * The 8-bit form is chosen for -127..127 only; -128 takes the 32-bit form.
 */
void ADDI(uint8_t **p, uint8_t dst, int32_t imm) {
	const int wide = (imm > 127 || imm <= -128);
	uint8_t *out = *p;

	*out++ = (dst >= 8) ? 0x49 : 0x48;
	*out++ = wide ? 0x81 : 0x83;
	*out++ = 0xc0 | (dst & 0x7);
	*p = out;

	if(wide) {
		IMM32(p, imm);
	}else{
		IMM8(p, imm);
	}
}
|
||||
|
||||
/*
 * Emit a relative call (opcode 0xe8 + 32-bit displacement) to func.
 *
 * The displacement is measured from the end of the five-byte instruction,
 * i.e. four bytes past the cursor position after the opcode was written.
 *
 * The distance is computed on uint8_t pointers; the previous form did
 * arithmetic on `void *`, which is a GNU extension and not accepted by
 * other compilers.
 *
 * NOTE(review): the distance is narrowed to 32 bits without a range check,
 * so func must lie within +/-2 GiB of the emitted code -- confirm callers
 * guarantee this.
 */
void CALL(uint8_t **p, uint8_t *func) {
	int32_t rel;

	*(*p)++ = 0xe8;
	rel = (int32_t)(func - (*p + 4));
	IMM32(p, rel);
}
|
||||
|
||||
/*
 * Emit a push of register reg: an optional 0x41 prefix when reg is 8 or
 * above, then 0x50 combined with the low three bits of reg.
 */
void PUSH(uint8_t **p, uint8_t reg) {
	uint8_t *out = *p;

	if(reg >= 8) {
		*out++ = 0x41;
	}
	*out++ = 0x50 | (reg & 7);
	*p = out;
}
|
||||
/*
 * Emit a pop into register reg: an optional 0x41 prefix when reg is 8 or
 * above, then 0x58 combined with the low three bits of reg.
 */
void POP(uint8_t **p, uint8_t reg) {
	uint8_t *out = *p;

	if(reg >= 8) {
		*out++ = 0x41;
	}
	*out++ = 0x58 | (reg & 7);
	*p = out;
}
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
416
3rdparty/ffts/ffts-master/src/ffts.c
vendored
Normal file
416
3rdparty/ffts/ffts-master/src/ffts.c
vendored
Normal file
@@ -0,0 +1,416 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#include "ffts.h"
|
||||
#include "macros.h"
|
||||
//#include "mini_macros.h"
|
||||
#include "patterns.h"
|
||||
#include "ffts_small.h"
|
||||
|
||||
#ifdef DYNAMIC_DISABLED
|
||||
#include "ffts_static.h"
|
||||
#else
|
||||
#include "codegen.h"
|
||||
#endif
|
||||
|
||||
#include <errno.h>
|
||||
#include <sys/mman.h>
|
||||
#include <string.h>
|
||||
#include <limits.h> /* for PAGESIZE */
|
||||
|
||||
#if __APPLE__
|
||||
#include <libkern/OSCacheControl.h>
|
||||
#else
|
||||
#endif
|
||||
|
||||
/*
 * Run the transform described by plan p, reading from in and writing to out.
 *
 * On SSE and NEON builds a message is logged when either buffer is not
 * 16-byte aligned; the transform is still invoked afterwards.
 *
 * The alignment test converts the pointers to uintptr_t.  The previous cast
 * to int discards the upper half of the address on 64-bit targets and is
 * rejected or warned about by several compilers.
 */
void ffts_execute(ffts_plan_t *p, const void * in, void * out) {

//TODO: Define NEEDS_ALIGNED properly instead
#if defined(HAVE_SSE) || defined(HAVE_NEON)
	if(((uintptr_t)in % 16) != 0) {
		LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n");
	}

	if(((uintptr_t)out % 16) != 0) {
		LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n");
	}
#endif

	p->transform(p, (const float *)in, (float *)out);
}
|
||||
|
||||
/*
 * Release a plan through the destroy callback stored in it (the callback
 * differs between 1-D and multi-dimensional plans).
 *
 * A NULL plan is ignored, mirroring free(NULL); previously it was
 * dereferenced.
 */
void ffts_free(ffts_plan_t *p) {
	if(p == NULL) return;

	p->destroy(p);
}
|
||||
|
||||
/*
 * Destroy callback for plans created by ffts_init_1d.
 *
 * Releases the twiddle table, the index arrays and, when present, the
 * memory region holding the transform code, then the plan itself.  If the
 * code region cannot be made writable again the process is terminated with
 * errno as the exit status.
 */
void ffts_free_1d(ffts_plan_t *p) {

	/* the twiddle table was obtained with FFTS_MALLOC */
	if(p->ws != NULL) {
		FFTS_FREE(p->ws);
	}

	/* free() accepts NULL, so unset members need no guard */
	free(p->is);
	free(p->ws_is);
	free(p->offsets);
	free(p->transforms);

	if(p->transform_base != NULL) {
		const int rc = mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE);

		if(rc != 0) {
			perror("Couldn't mprotect");
			exit(errno);
		}
		munmap(p->transform_base, p->transform_size);
	}

	free(p);
}
|
||||
|
||||
ffts_plan_t *ffts_init_1d(size_t N, int sign) {
|
||||
if(N == 0 || (N & (N - 1)) != 0){
|
||||
LOG("FFT size must be a power of two\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
|
||||
size_t leafN = 8;
|
||||
size_t i;
|
||||
|
||||
#ifdef __arm__
|
||||
//#ifdef HAVE_NEON
|
||||
V MULI_SIGN;
|
||||
|
||||
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
|
||||
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
|
||||
//#endif
|
||||
#else
|
||||
V MULI_SIGN;
|
||||
|
||||
if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
|
||||
else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
|
||||
#endif
|
||||
|
||||
p->transform = NULL;
|
||||
p->transform_base = NULL;
|
||||
p->transforms = NULL;
|
||||
p->is = NULL;
|
||||
p->ws_is = NULL;
|
||||
p->ws = NULL;
|
||||
p->offsets = NULL;
|
||||
p->destroy = ffts_free_1d;
|
||||
|
||||
if(N >= 32) {
|
||||
ffts_init_offsets(p, N, leafN);
|
||||
#ifdef __arm__
|
||||
#ifdef HAVE_NEON
|
||||
ffts_init_is(p, N, leafN, 1);
|
||||
#else
|
||||
ffts_init_is(p, N, leafN, 1);
|
||||
#endif
|
||||
#else
|
||||
ffts_init_is(p, N, leafN, 1);
|
||||
#endif
|
||||
|
||||
p->i0 = N/leafN/3+1;
|
||||
p->i1 = N/leafN/3;
|
||||
if((N/leafN) % 3 > 1) p->i1++;
|
||||
p->i2 = N/leafN/3;
|
||||
|
||||
#ifdef __arm__
|
||||
#ifdef HAVE_NEON
|
||||
p->i0/=2;
|
||||
p->i1/=2;
|
||||
#endif
|
||||
#else
|
||||
p->i0/=2;
|
||||
p->i1/=2;
|
||||
#endif
|
||||
|
||||
}else{
|
||||
p->transforms = malloc(2 * sizeof(transform_index_t));
|
||||
p->transforms[0] = 0;
|
||||
p->transforms[1] = 1;
|
||||
if(N == 2) p->transform = &firstpass_2;
|
||||
else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
|
||||
else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
|
||||
else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
|
||||
else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
|
||||
else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
|
||||
else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
|
||||
|
||||
p->is = NULL;
|
||||
p->offsets = NULL;
|
||||
}
|
||||
|
||||
int hardcoded = 0;
|
||||
|
||||
/* LUTS */
|
||||
size_t n_luts = __builtin_ctzl(N/leafN);
|
||||
if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
|
||||
|
||||
if(n_luts >= 32) n_luts = 0;
|
||||
|
||||
// fprintf(stderr, "n_luts = %zu\n", n_luts);
|
||||
|
||||
cdata_t *w;
|
||||
|
||||
int n = leafN*2;
|
||||
if(hardcoded) n = 8;
|
||||
|
||||
size_t lut_size = 0;
|
||||
|
||||
for(i=0;i<n_luts;i++) {
|
||||
if(!i || hardcoded) {
|
||||
#ifdef __arm__
|
||||
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
|
||||
else lut_size += n/4 * sizeof(cdata_t);
|
||||
#else
|
||||
lut_size += n/4 * 2 * sizeof(cdata_t);
|
||||
#endif
|
||||
n *= 2;
|
||||
} else {
|
||||
#ifdef __arm__
|
||||
lut_size += n/8 * 3 * sizeof(cdata_t);
|
||||
#else
|
||||
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
|
||||
#endif
|
||||
}
|
||||
n *= 2;
|
||||
}
|
||||
|
||||
// lut_size *= 16;
|
||||
|
||||
// fprintf(stderr, "lut size = %zu\n", lut_size);
|
||||
if(n_luts) {
|
||||
p->ws = FFTS_MALLOC(lut_size,32);
|
||||
p->ws_is = malloc(n_luts * sizeof(size_t));
|
||||
}else{
|
||||
p->ws = NULL;
|
||||
p->ws_is = NULL;
|
||||
}
|
||||
w = p->ws;
|
||||
|
||||
n = leafN*2;
|
||||
if(hardcoded) n = 8;
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
|
||||
#endif
|
||||
|
||||
for(i=0;i<n_luts;i++) {
|
||||
p->ws_is[i] = w - (cdata_t *)p->ws;
|
||||
//fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
|
||||
|
||||
if(!i || hardcoded) {
|
||||
cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
|
||||
|
||||
size_t j;
|
||||
for(j=0;j<n/4;j++) {
|
||||
w0[j][0] = W_re(n,j);
|
||||
w0[j][1] = W_im(n,j);
|
||||
}
|
||||
|
||||
|
||||
float *fw0 = (float *)w0;
|
||||
#ifdef __arm__
|
||||
if(N < 32) {
|
||||
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
|
||||
float *fw = (float *)w;
|
||||
V temp0, temp1, temp2;
|
||||
for(j=0;j<n/4;j+=2) {
|
||||
// #ifdef HAVE_NEON
|
||||
temp0 = VLD(fw0 + j*2);
|
||||
V re, im;
|
||||
re = VDUPRE(temp0);
|
||||
im = VDUPIM(temp0);
|
||||
#ifdef HAVE_NEON
|
||||
im = VXOR(im, MULI_SIGN);
|
||||
//im = IMULI(sign>0, im);
|
||||
#else
|
||||
im = MULI(sign>0, im);
|
||||
#endif
|
||||
VST(fw + j*4 , re);
|
||||
VST(fw + j*4+4, im);
|
||||
// #endif
|
||||
}
|
||||
w += n/4 * 2;
|
||||
}else{
|
||||
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
|
||||
float *fw = (float *)w;
|
||||
#ifdef HAVE_NEON
|
||||
VS temp0, temp1, temp2;
|
||||
for(j=0;j<n/4;j+=4) {
|
||||
temp0 = VLD2(fw0 + j*2);
|
||||
temp0.val[1] = VXOR(temp0.val[1], neg);
|
||||
STORESPR(fw + j*2, temp0);
|
||||
}
|
||||
#else
|
||||
for(j=0;j<n/4;j+=1) {
|
||||
fw[j*2] = fw0[j*2];
|
||||
fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
|
||||
}
|
||||
#endif
|
||||
w += n/4;
|
||||
}
|
||||
#else
|
||||
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
|
||||
float *fw = (float *)w;
|
||||
V temp0, temp1, temp2;
|
||||
for(j=0;j<n/4;j+=2) {
|
||||
temp0 = VLD(fw0 + j*2);
|
||||
V re, im;
|
||||
re = VDUPRE(temp0);
|
||||
im = VDUPIM(temp0);
|
||||
im = VXOR(im, MULI_SIGN);
|
||||
VST(fw + j*4 , re);
|
||||
VST(fw + j*4+4, im);
|
||||
}
|
||||
w += n/4 * 2;
|
||||
#endif
|
||||
|
||||
FFTS_FREE(w0);
|
||||
}else{
|
||||
|
||||
cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
|
||||
cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
|
||||
cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
|
||||
|
||||
size_t j;
|
||||
for(j=0;j<n/8;j++) {
|
||||
w0[j][0] = W_re(n,j*2);
|
||||
w0[j][1] = W_im(n,j*2);
|
||||
w1[j][0] = W_re(n,j);
|
||||
w1[j][1] = W_im(n,j);
|
||||
w2[j][0] = W_re(n,j + (n/8));
|
||||
w2[j][1] = W_im(n,j + (n/8));
|
||||
|
||||
}
|
||||
|
||||
float *fw0 = (float *)w0;
|
||||
float *fw1 = (float *)w1;
|
||||
float *fw2 = (float *)w2;
|
||||
#ifdef __arm__
|
||||
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
|
||||
float *fw = (float *)w;
|
||||
#ifdef HAVE_NEON
|
||||
VS temp0, temp1, temp2;
|
||||
for(j=0;j<n/8;j+=4) {
|
||||
temp0 = VLD2(fw0 + j*2);
|
||||
temp0.val[1] = VXOR(temp0.val[1], neg);
|
||||
STORESPR(fw + j*2*3, temp0);
|
||||
temp1 = VLD2(fw1 + j*2);
|
||||
temp1.val[1] = VXOR(temp1.val[1], neg);
|
||||
STORESPR(fw + j*2*3 + 8, temp1);
|
||||
temp2 = VLD2(fw2 + j*2);
|
||||
temp2.val[1] = VXOR(temp2.val[1], neg);
|
||||
STORESPR(fw + j*2*3 + 16, temp2);
|
||||
}
|
||||
#else
|
||||
for(j=0;j<n/8;j+=1) {
|
||||
fw[j*6] = fw0[j*2];
|
||||
fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
|
||||
fw[j*6+2] = fw1[j*2+0];
|
||||
fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
|
||||
fw[j*6+4] = fw2[j*2+0];
|
||||
fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
|
||||
}
|
||||
#endif
|
||||
w += n/8 * 3;
|
||||
#else
|
||||
//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
|
||||
float *fw = (float *)w;
|
||||
V temp0, temp1, temp2, re, im;
|
||||
for(j=0;j<n/8;j+=2) {
|
||||
temp0 = VLD(fw0 + j*2);
|
||||
re = VDUPRE(temp0);
|
||||
im = VDUPIM(temp0);
|
||||
im = VXOR(im, MULI_SIGN);
|
||||
VST(fw + j*2*6 , re);
|
||||
VST(fw + j*2*6+4, im);
|
||||
|
||||
temp1 = VLD(fw1 + j*2);
|
||||
re = VDUPRE(temp1);
|
||||
im = VDUPIM(temp1);
|
||||
im = VXOR(im, MULI_SIGN);
|
||||
VST(fw + j*2*6+8 , re);
|
||||
VST(fw + j*2*6+12, im);
|
||||
|
||||
temp2 = VLD(fw2 + j*2);
|
||||
re = VDUPRE(temp2);
|
||||
im = VDUPIM(temp2);
|
||||
im = VXOR(im, MULI_SIGN);
|
||||
VST(fw + j*2*6+16, re);
|
||||
VST(fw + j*2*6+20, im);
|
||||
}
|
||||
w += n/8 * 3 * 2;
|
||||
#endif
|
||||
|
||||
FFTS_FREE(w0);
|
||||
FFTS_FREE(w1);
|
||||
FFTS_FREE(w2);
|
||||
}
|
||||
///p->ws[i] = w;
|
||||
|
||||
n *= 2;
|
||||
}
|
||||
|
||||
float *tmp = (float *)p->ws;
|
||||
|
||||
if(sign < 0) {
|
||||
p->oe_ws = (void *)(&w_data[4]);
|
||||
p->ee_ws = (void *)(w_data);
|
||||
p->eo_ws = (void *)(&w_data[4]);
|
||||
}else{
|
||||
p->oe_ws = (void *)(w_data + 12);
|
||||
p->ee_ws = (void *)(w_data + 8);
|
||||
p->eo_ws = (void *)(w_data + 12);
|
||||
}
|
||||
|
||||
p->N = N;
|
||||
p->lastlut = w;
|
||||
p->n_luts = n_luts;
|
||||
#ifdef DYNAMIC_DISABLED
|
||||
if(sign < 0) {
|
||||
if(N >= 32) p->transform = ffts_static_transform_f;
|
||||
}else{
|
||||
if(N >= 32) p->transform = ffts_static_transform_i;
|
||||
}
|
||||
|
||||
#else
|
||||
if(N>=32) ffts_generate_func_code(p, N, leafN, sign);
|
||||
#endif
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
186
3rdparty/ffts/ffts-master/src/ffts.h
vendored
Normal file
186
3rdparty/ffts/ffts-master/src/ffts.h
vendored
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#ifndef __CP_SSE_H__
|
||||
#define __CP_SSE_H__
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
//#include <stdalign.h>
|
||||
|
||||
//#include "codegen.h"
|
||||
#include "types.h"
|
||||
|
||||
|
||||
/*
 * LOG(s): report the message s -- to the Android log under the "FFTS" tag on
 * Android, to stderr elsewhere.
 *
 * The message is passed as data for a fixed "%s" format rather than as the
 * format string itself, so a '%' inside s can never be interpreted as a
 * conversion specifier.
 */
#ifdef __ANDROID__
#include <android/log.h>
#define LOG(s) __android_log_print(ANDROID_LOG_ERROR, "FFTS", "%s", (s))
#else
#define LOG(s) fprintf(stderr, "%s", (s))
#endif
|
||||
|
||||
#define PI 3.1415926535897932384626433832795028841971693993751058209
|
||||
|
||||
/*
 * Constant table referenced by the plan's ee_ws / oe_ws / eo_ws pointers.
 * ffts_init_1d selects the halves by sign:
 *   sign <  0 : ee_ws -> element 0,  oe_ws and eo_ws -> element 4
 *   otherwise : ee_ws -> element 8,  oe_ws and eo_ws -> element 12
 * The table is aligned to 64 bytes.
 */
static const float w_data[16] __attribute__ ((aligned(64))) = {
	/*  0 */ 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
	/*  2 */ -0.70710678118654757273731092936941, -0.70710678118654746171500846685376,
	/*  4 */ 1.0f, 0.70710678118654757273731092936941f,
	/*  6 */ -0.0f, -0.70710678118654746171500846685376,
	/*  8 */ 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
	/* 10 */ 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
	/* 12 */ 1.0f, 0.70710678118654757273731092936941f,
	/* 14 */ 0.0f, 0.70710678118654746171500846685376
};
|
||||
|
||||
/*
 * Real part of the twiddle factor: cos(-2*pi*k/N).
 * PI is a double constant, so the angle and the cosine are evaluated in
 * double precision and only the result is narrowed to float.
 */
__INLINE float W_re(float N, float k) {
	const double angle = -2.0f * PI * k / N;

	return (float)cos(angle);
}
|
||||
/*
 * Imaginary part of the twiddle factor: sin(-2*pi*k/N).
 * PI is a double constant, so the angle and the sine are evaluated in
 * double precision and only the result is narrowed to float.
 */
__INLINE float W_im(float N, float k) {
	const double angle = -2.0f * PI * k / N;

	return (float)sin(angle);
}
|
||||
|
||||
typedef size_t transform_index_t;
|
||||
|
||||
//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
|
||||
typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
|
||||
|
||||
typedef struct _ffts_plan_t ffts_plan_t;
|
||||
|
||||
/**
|
||||
* Contains all the Information need to perform FFT
|
||||
*
|
||||
*
|
||||
* DO NOT CHANGE THE ORDER OF MEMBERS
|
||||
* ASSEMBLY CODE USES HARD CODED OFFSETS TO REFERENCE
|
||||
* SOME OF THESE VARIABES!!
|
||||
*/
|
||||
struct _ffts_plan_t {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
ptrdiff_t *offsets;
|
||||
#ifdef DYNAMIC_DISABLED
|
||||
/**
|
||||
* Twiddle factors
|
||||
*/
|
||||
void *ws;
|
||||
/**
|
||||
* ee - 2 size x size8
|
||||
* oo - 2 x size4 in parallel
|
||||
* oe -
|
||||
*/
|
||||
void *oe_ws, *eo_ws, *ee_ws;
|
||||
#else
|
||||
void __attribute__((aligned(32))) *ws;
|
||||
void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
|
||||
#endif
|
||||
/**
|
||||
* Pointer into an array of precomputed indexes for the input data array
|
||||
*/
|
||||
ptrdiff_t *is;
|
||||
|
||||
/**
|
||||
* Twiddle Factor Indexes
|
||||
*/
|
||||
size_t *ws_is;
|
||||
|
||||
/**
|
||||
* Size of the loops for the base cases
|
||||
*/
|
||||
size_t i0, i1, n_luts;
|
||||
|
||||
/**
|
||||
* Size fo the Transform
|
||||
*/
|
||||
size_t N;
|
||||
void *lastlut;
|
||||
/**
|
||||
* Used in multidimensional Code ??
|
||||
*/
|
||||
transform_index_t *transforms;
|
||||
//transform_func_t transform;
|
||||
|
||||
/**
|
||||
* Pointer to the dynamically generated function
|
||||
* that will execute the FFT
|
||||
*/
|
||||
void (*transform)(ffts_plan_t * , const void * , void * );
|
||||
|
||||
/**
|
||||
* Pointer to the base memory address of
|
||||
* of the transform function
|
||||
*/
|
||||
void *transform_base;
|
||||
|
||||
/**
|
||||
* Size of the memory block contain the
|
||||
* generated code
|
||||
*/
|
||||
size_t transform_size;
|
||||
|
||||
/**
|
||||
* Points to the cosnant variables used by
|
||||
* the Assembly Code
|
||||
*/
|
||||
void *constants;
|
||||
|
||||
// multi-dimensional stuff:
|
||||
struct _ffts_plan_t **plans;
|
||||
int rank;
|
||||
size_t *Ns, *Ms;
|
||||
void *buf;
|
||||
|
||||
void *transpose_buf;
|
||||
|
||||
/**
|
||||
* Pointer to the destroy function
|
||||
* to clean up the plan after use
|
||||
* (differs for real and multi dimension transforms
|
||||
*/
|
||||
void (*destroy)(ffts_plan_t *);
|
||||
|
||||
/**
|
||||
* Coefficiants for the real valued transforms
|
||||
*/
|
||||
float *A, *B;
|
||||
|
||||
size_t i2;
|
||||
};
|
||||
|
||||
|
||||
void ffts_free(ffts_plan_t *);
|
||||
ffts_plan_t *ffts_init_1d(size_t N, int sign);
|
||||
void ffts_execute(ffts_plan_t *, const void *, void *);
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
314
3rdparty/ffts/ffts-master/src/ffts_nd.c
vendored
Normal file
314
3rdparty/ffts/ffts-master/src/ffts_nd.c
vendored
Normal file
@@ -0,0 +1,314 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "ffts_nd.h"
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
#include "neon.h"
|
||||
#endif
|
||||
|
||||
/*
 * Destroy callback for plans created by ffts_init_nd.
 *
 * ffts_init_nd reuses one sub-plan for every dimension with the same Ms
 * value, so a sub-plan is released only at the first index carrying that Ms
 * value.  Afterwards the bookkeeping arrays, both work buffers and the plan
 * itself are freed.
 */
void ffts_free_nd(ffts_plan_t *p) {
	int dim;

	for(dim = 0; dim < p->rank; dim++) {
		ffts_plan_t *sub = p->plans[dim];
		int seen_before = 0;
		int prev;

		for(prev = 0; prev < dim; prev++) {
			if(p->Ms[prev] == p->Ms[dim]) seen_before = 1;
		}

		if(!seen_before && sub) {
			ffts_free(sub);
		}
	}

	free(p->Ns);
	free(p->Ms);
	free(p->plans);
	free(p->buf);
	free(p->transpose_buf);
	free(p);
}
|
||||
#define TSIZE 8
|
||||
#include <string.h>
|
||||
/*
 * Transpose a matrix of 64-bit elements: out[x*h + y] = in[y*w + x].
 *
 * in   source, h rows of w elements
 * out  destination, w rows of h elements
 * w,h  source width and height
 * buf  scratch area for one 8x8 tile (used by the NEON path only)
 *
 * NEON and SSE builds work on 8x8 tiles, so they expect w and h to be
 * multiples of 8 and the buffers to satisfy the aligned load/store
 * instructions used.  All other builds use a plain element-wise loop.
 *
 * Changes relative to the previous revision:
 *   - the 32-bit ARM inline assembly now lists q12-q15 as clobbered; they
 *     were written by the asm but missing from the clobber list;
 *   - ptemp is initialised before being handed to the asm as a "+r" operand;
 *   - builds with neither NEON nor SSE used to compile to an empty function
 *     that left out untouched; they now get a scalar fallback;
 *   - unused locals removed.
 */
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {

#ifdef HAVE_NEON
	size_t i,j;

	for(j=0;j<h;j+=8) {
		for(i=0;i<w;i+=8) {
			neon_transpose_to_buf(in + j*w + i, buf, w);

			/*
			 * NOTE(review): the tile origin uses h as the row stride while
			 * the rows below advance by w; the two agree only when w == h.
			 * Left unchanged -- confirm against neon_transpose_to_buf.
			 */
			uint64_t *p = out + i*h + j;
			uint64_t *pbuf = buf;
			uint64_t *ptemp = p;
#if defined(__aarch64__) || defined(__arm64__)
			// This particular function comes out nicely using arm64 intrinsics; no need to deal with inline asm
			{
				uint64x2_t q8,q9,q10,q11,q12,q13,q14,q15;
				int x;
				for (x=0; x<4; x++)
				{
					ptemp = p;
					p += w;
					q8 = vld1q_u64(&pbuf[0]);
					q9 = vld1q_u64(&pbuf[2]);
					q10 = vld1q_u64(&pbuf[4]);
					q11 = vld1q_u64(&pbuf[6]);
					q12 = vld1q_u64(&pbuf[8]);
					q13 = vld1q_u64(&pbuf[10]);
					q14 = vld1q_u64(&pbuf[12]);
					q15 = vld1q_u64(&pbuf[14]);
					pbuf += 16;
					vst1q_u64(&ptemp[0], q8);
					vst1q_u64(&ptemp[2], q9);
					vst1q_u64(&ptemp[4], q10);
					vst1q_u64(&ptemp[6], q11);
					ptemp = p;
					p += w;
					vst1q_u64(&ptemp[0], q12);
					vst1q_u64(&ptemp[2], q13);
					vst1q_u64(&ptemp[4], q14);
					vst1q_u64(&ptemp[6], q15);
				} // for x
			} // aarch64
#else
			__asm__ __volatile__(
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
				"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
				"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
				"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"add %[p], %[p], %[w], lsl #3\n\t"
				"vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
				"vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
				"vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
				"mov %[ptemp], %[p]\n\t"
				"vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
				"vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"

				: [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
				: [w] "r" (w)
				: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			);
#endif // 32 vs 64-bit version
			// out[i*h + j] = in[j*w + i];
		}
	}
#elif defined(HAVE_SSE)
	uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
	int tx, ty;
	int y;
	int tw = w / TSIZE;
	int th = h / TSIZE;

	(void)buf;

	for (ty=0;ty<th;ty++) {
		for (tx=0;tx<tw;tx++) {
			uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
			uint64_t *op0 = tmp;

			// Copy/transpose to tmp, two source columns per iteration
			for (y=0;y<TSIZE;y+=2) {
				__m128d q0 = _mm_load_pd((double *)(ip0 + 0*w));
				__m128d q1 = _mm_load_pd((double *)(ip0 + 1*w));
				__m128d q2 = _mm_load_pd((double *)(ip0 + 2*w));
				__m128d q3 = _mm_load_pd((double *)(ip0 + 3*w));
				__m128d q4 = _mm_load_pd((double *)(ip0 + 4*w));
				__m128d q5 = _mm_load_pd((double *)(ip0 + 5*w));
				__m128d q6 = _mm_load_pd((double *)(ip0 + 6*w));
				__m128d q7 = _mm_load_pd((double *)(ip0 + 7*w));
				ip0 += 2;

				__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
				__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
				__m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
				__m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
				__m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
				__m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
				__m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
				__m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));

				_mm_store_pd((double *)(op0 + 0), t0);
				_mm_store_pd((double *)(op0 + 0 + TSIZE), t1);
				_mm_store_pd((double *)(op0 + 2 ), t2);
				_mm_store_pd((double *)(op0 + 2 + TSIZE), t3);
				_mm_store_pd((double *)(op0 + 4 ), t4);
				_mm_store_pd((double *)(op0 + 4 + TSIZE), t5);
				_mm_store_pd((double *)(op0 + 6 ), t6);
				_mm_store_pd((double *)(op0 + 6 + TSIZE), t7);

				op0 += 2*TSIZE;
			}

			// Copy the transposed tile from tmp to its place in out
			op0 = out + h*tx*TSIZE + ty*TSIZE;
			ip0 = tmp;
			for (y=0;y<TSIZE;y+=1) {
				__m128d q0 = _mm_load_pd((double *)(ip0 + 0));
				__m128d q1 = _mm_load_pd((double *)(ip0 + 2));
				__m128d q2 = _mm_load_pd((double *)(ip0 + 4));
				__m128d q3 = _mm_load_pd((double *)(ip0 + 6));
				_mm_store_pd((double *)(op0 + 0), q0);
				_mm_store_pd((double *)(op0 + 2), q1);
				_mm_store_pd((double *)(op0 + 4), q2);
				_mm_store_pd((double *)(op0 + 6), q3);

				op0 += h;
				ip0 += TSIZE;
			}
		}
	}
#else
	/* portable fallback: straightforward element-wise transpose */
	int x, y;

	(void)buf;

	for (y=0;y<h;y++) {
		for (x=0;x<w;x++) {
			out[(size_t)x*(size_t)h + (size_t)y] = in[(size_t)y*(size_t)w + (size_t)x];
		}
	}
#endif

}
|
||||
|
||||
/*
 * Transform callback for plans created by ffts_init_nd.
 *
 * For every dimension d: run sub-plan d on Ns[d] consecutive rows of Ms[d]
 * elements, writing into the plan's work buffer, then transpose the work
 * buffer into out.  The first dimension reads from in; each later dimension
 * reads the previous transpose result from out.
 */
void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out) {

	uint64_t *src = (uint64_t *)in;
	uint64_t *work = p->buf;
	uint64_t *dst = (uint64_t *)out;

	size_t dim, row;

	/* first dimension: in -> work -> out */
	for(row = 0; row < p->Ns[0]; row++) {
		const size_t at = row * p->Ms[0];

		p->plans[0]->transform(p->plans[0], src + at, work + at);
	}
	ffts_transpose(work, dst, p->Ms[0], p->Ns[0], p->transpose_buf);

	/* remaining dimensions: out -> work -> out */
	for(dim = 1; dim < p->rank; dim++) {
		for(row = 0; row < p->Ns[dim]; row++) {
			const size_t at = row * p->Ms[dim];

			p->plans[dim]->transform(p->plans[dim], dst + at, work + at);
		}
		ffts_transpose(work, dst, p->Ms[dim], p->Ns[dim], p->transpose_buf);
	}
}
|
||||
|
||||
ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) {
|
||||
size_t vol = 1;
|
||||
|
||||
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
|
||||
|
||||
p->transform = &ffts_execute_nd;
|
||||
p->destroy = &ffts_free_nd;
|
||||
|
||||
p->rank = rank;
|
||||
p->Ns = malloc(sizeof(size_t) * rank);
|
||||
p->Ms = malloc(sizeof(size_t) * rank);
|
||||
p->plans = malloc(sizeof(ffts_plan_t **) * rank);
|
||||
int i;
|
||||
for(i=0;i<rank;i++) {
|
||||
p->Ns[i] = Ns[i];
|
||||
vol *= Ns[i];
|
||||
}
|
||||
p->buf = valloc(sizeof(float) * 2 * vol);
|
||||
|
||||
for(i=0;i<rank;i++) {
|
||||
p->Ms[i] = vol / p->Ns[i];
|
||||
|
||||
p->plans[i] = NULL;
|
||||
int k;
|
||||
for(k=0;k<i;k++) {
|
||||
if(p->Ms[k] == p->Ms[i])
|
||||
p->plans[i] = p->plans[k];
|
||||
}
|
||||
|
||||
if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
|
||||
}
|
||||
|
||||
p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Convenience wrapper: build a rank-2 plan with dimension lengths
 * {N1, N2} by delegating to ffts_init_nd().
 */
ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) {
	size_t dims[2];

	dims[1] = N2;
	dims[0] = N1;

	return ffts_init_nd(2, dims, sign);
}
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
59
3rdparty/ffts/ffts-master/src/ffts_nd.h
vendored
Normal file
59
3rdparty/ffts/ffts-master/src/ffts_nd.h
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __FFTS_ND_H__
|
||||
#define __FFTS_ND_H__
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#ifdef HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
/* Public entry points of the N-dimensional complex transform. */

/* Destroy callback installed by ffts_init_nd() on the plans it creates. */
void ffts_free_nd(ffts_plan_t *p);
|
||||
/* Called by ffts_execute_nd() as (scratch, out, Ms[i], Ns[i], transpose_buf);
   NOTE(review): presumably transposes a w x h matrix of 64-bit elements -- confirm against the definition. */
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf);
|
||||
|
||||
/* Transform callback installed by ffts_init_nd(): runs every dimension's sub-plan, transposing after each. */
void ffts_execute_nd(ffts_plan_t *p, const void * in, void * out);
|
||||
/* Build a plan for `rank` dimensions with lengths Ns[0..rank-1]; `sign` is passed on to ffts_init_1d(). */
ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
|
||||
/* Shorthand for ffts_init_nd(2, {N1, N2}, sign). */
ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
|
||||
|
||||
#endif
|
||||
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
227
3rdparty/ffts/ffts-master/src/ffts_real.c
vendored
Normal file
227
3rdparty/ffts/ffts-master/src/ffts_real.c
vendored
Normal file
@@ -0,0 +1,227 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "ffts_real.h"
|
||||
|
||||
/*
 * Release a plan created by ffts_init_1d_real(): the complex sub-plan,
 * the A/B coefficient tables, the scratch buffer, the sub-plan array
 * and finally the plan structure itself.
 */
void ffts_free_1d_real(ffts_plan_t *p) {
	ffts_plan_t *sub_plan = p->plans[0];

	/* the sub-plan must go before the array that holds its pointer */
	ffts_free(sub_plan);
	free(p->plans);

	free(p->buf);
	free(p->B);
	free(p->A);

	free(p);
}
|
||||
|
||||
/*
 * Real-input transform (installed by ffts_init_1d_real() when sign < 0).
 *
 * 1. Run the complex sub-plan plans[0] on `vin`, writing into p->buf.
 * 2. Copy the first complex value of buf to index N / N+1 so that the
 *    mirrored reads of the first loop iteration stay inside the buffer.
 * 3. For every i in [0, N/2) combine buf element i with its mirrored
 *    element using the A and B tables and store the result in `vout`.
 * 4. Write the final pair out[N] = buf[0] - buf[1], out[N+1] = 0.
 *
 * `vout` therefore receives N + 2 floats.
 */
void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) {
	float *out = (float *)vout;
	float *buf = (float *)p->buf;
	float *A = p->A;
	float *B = p->B;
	size_t N = p->N;
	size_t i;
#ifdef __ARM_NEON__
	/* running pointers advanced by the assembly below */
	float *p_buf0 = buf;
	float *p_buf1 = buf + N - 2;
	float *p_out = out;
#endif

	p->plans[0]->transform(p->plans[0], vin, buf);

	/* duplicate the first complex value past the end for the mirrored access */
	buf[N] = buf[0];
	buf[N+1] = buf[1];

#ifdef __ARM_NEON__
	/* two complex outputs per iteration */
	for (i = 0; i < N/2; i += 2) {
		__asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
		                      "vld1.32 {q9}, [%[pb], :128]!\n\t"
		                      "vld1.32 {q10}, [%[buf0], :128]!\n\t"
		                      "vld1.32 {q11}, [%[buf1], :64]\n\t"
		                      "sub %[buf1], %[buf1], #16\n\t"

		                      "vdup.32 d26, d16[1]\n\t"
		                      "vdup.32 d27, d17[1]\n\t"
		                      "vdup.32 d24, d16[0]\n\t"
		                      "vdup.32 d25, d17[0]\n\t"

		                      "vdup.32 d30, d23[1]\n\t"
		                      "vdup.32 d31, d22[1]\n\t"
		                      "vdup.32 d28, d23[0]\n\t"
		                      "vdup.32 d29, d22[0]\n\t"

		                      "vmul.f32 q13, q13, q10\n\t"
		                      "vmul.f32 q15, q15, q9\n\t"
		                      "vmul.f32 q12, q12, q10\n\t"
		                      "vmul.f32 q14, q14, q9\n\t"
		                      "vrev64.f32 q13, q13\n\t"
		                      "vrev64.f32 q15, q15\n\t"

		                      "vtrn.32 d26, d27\n\t"
		                      "vtrn.32 d30, d31\n\t"
		                      "vneg.f32 d26, d26\n\t"
		                      "vneg.f32 d31, d31\n\t"
		                      "vtrn.32 d26, d27\n\t"
		                      "vtrn.32 d30, d31\n\t"

		                      "vadd.f32 q12, q12, q14\n\t"
		                      "vadd.f32 q13, q13, q15\n\t"
		                      "vadd.f32 q12, q12, q13\n\t"
		                      "vst1.32 {q12}, [%[pout], :128]!\n\t"
		                      : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
		                        [pout] "+r" (p_out)
		                      :
		                      : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
		);
	}
#else
	for (i = 0; i < N/2; i++) {
		const size_t re = 2*i, im = 2*i + 1;            /* element i */
		const size_t mre = N - 2*i, mim = N - 2*i + 1;  /* mirrored element */

		out[re] = buf[re]*A[re] - buf[im]*A[im] + buf[mre]*B[re] + buf[mim]*B[im];
		out[im] = buf[im]*A[re] + buf[re]*A[im] + buf[mre]*B[im] - buf[mim]*B[re];
	}
#endif

	out[N] = buf[0] - buf[1];
	out[N+1] = 0.0f;
}
|
||||
|
||||
/*
 * Inverse of the real transform (installed by ffts_init_1d_real() when
 * sign >= 0).
 *
 * For every i in [0, N/2) element i of `vin` is combined with its
 * mirrored element (float indices N - 2i and N - 2i + 1, so `vin` must
 * hold N + 2 floats) using the A and B tables; the result is written to
 * p->buf.  The complex sub-plan plans[0] then transforms p->buf into
 * `vout`.
 */
void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
	float *out = (float *)vout;
	float *in = (float *)vin;
	float *buf = (float *)p->buf;
	float *A = p->A;
	float *B = p->B;
	size_t N = p->N;
	size_t i;
#ifdef __ARM_NEON__
	/* running pointers advanced by the assembly below */
	float *p_buf0 = in;
	float *p_buf1 = in + N - 2;
	float *p_out = buf;
#endif

#ifdef __ARM_NEON__
	/* two complex outputs per iteration */
	for (i = 0; i < N/2; i += 2) {
		__asm__ __volatile__ ("vld1.32 {q8}, [%[pa], :128]!\n\t"
		                      "vld1.32 {q9}, [%[pb], :128]!\n\t"
		                      "vld1.32 {q10}, [%[buf0], :128]!\n\t"
		                      "vld1.32 {q11}, [%[buf1], :64]\n\t"
		                      "sub %[buf1], %[buf1], #16\n\t"

		                      "vdup.32 d26, d16[1]\n\t"
		                      "vdup.32 d27, d17[1]\n\t"
		                      "vdup.32 d24, d16[0]\n\t"
		                      "vdup.32 d25, d17[0]\n\t"

		                      "vdup.32 d30, d23[1]\n\t"
		                      "vdup.32 d31, d22[1]\n\t"
		                      "vdup.32 d28, d23[0]\n\t"
		                      "vdup.32 d29, d22[0]\n\t"

		                      "vmul.f32 q13, q13, q10\n\t"
		                      "vmul.f32 q15, q15, q9\n\t"
		                      "vmul.f32 q12, q12, q10\n\t"
		                      "vmul.f32 q14, q14, q9\n\t"
		                      "vrev64.f32 q13, q13\n\t"
		                      "vrev64.f32 q15, q15\n\t"

		                      "vtrn.32 d26, d27\n\t"
		                      "vtrn.32 d28, d29\n\t"
		                      "vneg.f32 d27, d27\n\t"
		                      "vneg.f32 d29, d29\n\t"
		                      "vtrn.32 d26, d27\n\t"
		                      "vtrn.32 d28, d29\n\t"

		                      "vadd.f32 q12, q12, q14\n\t"
		                      "vsub.f32 q13, q13, q15\n\t"
		                      "vadd.f32 q12, q12, q13\n\t"
		                      "vst1.32 {q12}, [%[pout], :128]!\n\t"
		                      : [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
		                        [pout] "+r" (p_out)
		                      :
		                      : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
		);
	}
#else
	for (i = 0; i < N/2; i++) {
		const size_t re = 2*i, im = 2*i + 1;            /* element i */
		const size_t mre = N - 2*i, mim = N - 2*i + 1;  /* mirrored element */

		buf[re] = in[re]*A[re] + in[im]*A[im] + in[mre]*B[re] - in[mim]*B[im];
		buf[im] = in[im]*A[re] - in[re]*A[im] - in[mre]*B[im] - in[mim]*B[re];
	}
#endif

	p->plans[0]->transform(p->plans[0], buf, out);
}
|
||||
|
||||
ffts_plan_t *ffts_init_1d_real(size_t N, int sign) {
|
||||
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
|
||||
|
||||
if(sign < 0) p->transform = &ffts_execute_1d_real;
|
||||
else p->transform = &ffts_execute_1d_real_inv;
|
||||
|
||||
p->destroy = &ffts_free_1d_real;
|
||||
p->N = N;
|
||||
p->rank = 1;
|
||||
p->plans = malloc(sizeof(ffts_plan_t **) * 1);
|
||||
|
||||
p->plans[0] = ffts_init_1d(N/2, sign);
|
||||
|
||||
p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1));
|
||||
|
||||
p->A = valloc(sizeof(float) * N);
|
||||
p->B = valloc(sizeof(float) * N);
|
||||
|
||||
if(sign < 0) {
|
||||
int i;
|
||||
for (i = 0; i < N/2; i++) {
|
||||
p->A[2 * i] = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
|
||||
p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
|
||||
p->B[2 * i] = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
|
||||
p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
|
||||
}
|
||||
}else{
|
||||
int i;
|
||||
for (i = 0; i < N/2; i++) {
|
||||
p->A[2 * i] = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
|
||||
p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
|
||||
p->B[2 * i] = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
|
||||
p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
54
3rdparty/ffts/ffts-master/src/ffts_real.h
vendored
Normal file
54
3rdparty/ffts/ffts-master/src/ffts_real.h
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __FFTS_REAL_H__
|
||||
#define __FFTS_REAL_H__
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#ifdef HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
/* Create a real-transform plan of length N: sign < 0 selects ffts_execute_1d_real,
   any other sign selects ffts_execute_1d_real_inv.  The plan's destroy callback is ffts_free_1d_real. */
ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
|
||||
|
||||
#endif
|
||||
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
197
3rdparty/ffts/ffts-master/src/ffts_real_nd.c
vendored
Normal file
197
3rdparty/ffts/ffts-master/src/ffts_real_nd.c
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "ffts_real_nd.h"
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
#include "neon.h"
|
||||
#endif
|
||||
|
||||
/*
 * Release a plan created by ffts_init_nd_real().
 *
 * Several dimensions may share one sub-plan pointer, so before a
 * sub-plan is freed every later slot holding the same pointer is
 * cleared; each distinct sub-plan is thus freed exactly once.
 */
void ffts_free_nd_real(ffts_plan_t *p) {
	int dim, later;

	for (dim = 0; dim < p->rank; dim++) {
		ffts_plan_t *sub_plan = p->plans[dim];

		/* slot already released through an earlier duplicate */
		if (!sub_plan)
			continue;

		for (later = dim + 1; later < p->rank; later++) {
			if (p->plans[later] == sub_plan)
				p->plans[later] = NULL;
		}

		ffts_free(sub_plan);
	}

	free(p->Ns);
	free(p->Ms);
	free(p->plans);
	free(p->buf);
	free(p->transpose_buf);
	free(p);
}
|
||||
|
||||
/*
 * Plain (non-vectorised) matrix transpose of 64-bit elements.
 *
 * `in` is read as h rows of w elements, `out` is written as w rows of
 * h elements: out[i*h + j] = in[j*w + i].  `buf` is not used.
 */
void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
	const size_t n_cols = w;
	const size_t n_rows = h;
	size_t col, row;

	for (col = 0; col < n_cols; col++) {
		uint64_t *dst = out + col * n_rows;   /* output row `col` */
		uint64_t *src = in + col;             /* input column `col` */

		for (row = 0; row < n_rows; row++) {
			dst[row] = src[row * n_cols];
		}
	}
}
|
||||
|
||||
/*
 * Multi-dimensional real-input transform (installed by
 * ffts_init_nd_real() when sign < 0).
 *
 * Dimension 0 runs plans[0] on Ns[0] rows: input rows are Ms[0] 32-bit
 * elements apart, output rows are Ms[0]/2 + 1 64-bit elements apart in
 * the scratch buffer, which is then transposed into `out`.  Every later
 * dimension runs its sub-plan from `out` into the scratch buffer on
 * rows of Ms[dim] 64-bit elements and transposes back into `out`.
 */
void ffts_execute_nd_real(ffts_plan_t *p, const void * in, void * out) {
	uint32_t *real_in = (uint32_t *)in;
	uint64_t *scratch = p->buf;
	uint64_t *result = (uint64_t *)out;
	size_t dim, row;

	/* first dimension: real rows in, half-spectrum rows out */
	row = 0;
	while (row < p->Ns[0]) {
		p->plans[0]->transform(p->plans[0],
		                       real_in + (row * p->Ms[0]),
		                       scratch + (row * (p->Ms[0] / 2 + 1)));
		row++;
	}
	ffts_scalar_transpose(scratch, result, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);

	/* remaining dimensions: complex rows, out -> scratch -> out */
	for (dim = 1; dim < p->rank; dim++) {
		for (row = 0; row < p->Ns[dim]; row++) {
			size_t offset = row * p->Ms[dim];

			p->plans[dim]->transform(p->plans[dim], result + offset, scratch + offset);
		}
		ffts_scalar_transpose(scratch, result, p->Ms[dim], p->Ns[dim], p->transpose_buf);
	}
}
|
||||
|
||||
/*
 * Inverse multi-dimensional real transform (installed by
 * ffts_init_nd_real() when sign >= 0).
 *
 * The scratch buffer is used as two regions: the start of p->buf and a
 * second region beginning `volume` 64-bit elements later, where volume
 * is the product of all p->Ns.  Steps:
 *   1. transpose `in` into the first region,
 *   2. run plans[0] on Ms[0] rows of Ns[0] elements into the second region,
 *   3. transpose the second region back into the first,
 *   4. run plans[1] on Ms[1] rows, writing float rows of Ns[1] elements
 *      to `out`.
 *
 * Only plans[0] and plans[1] are used, i.e. the code handles two
 * dimensions regardless of p->rank.
 */
void ffts_execute_nd_real_inv(ffts_plan_t *p, const void * in, void * out) {
	uint64_t *spectrum = (uint64_t *)in;
	uint64_t *first = p->buf;
	uint64_t *second;
	float *real_out = (float *)out;
	size_t volume = 1;
	size_t dim, row;

	for (dim = 0; dim < p->rank; dim++) {
		volume *= p->Ns[dim];
	}
	second = first + volume;

	ffts_scalar_transpose(spectrum, first, p->Ms[0], p->Ns[0], p->transpose_buf);

	for (row = 0; row < p->Ms[0]; row++) {
		size_t offset = row * p->Ns[0];

		p->plans[0]->transform(p->plans[0], first + offset, second + offset);
	}

	ffts_scalar_transpose(second, first, p->Ns[0], p->Ms[0], p->transpose_buf);

	for (row = 0; row < p->Ms[1]; row++) {
		p->plans[1]->transform(p->plans[1], first + (row * (p->Ms[0])), &real_out[row * p->Ns[1]]);
	}
}
|
||||
|
||||
ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
|
||||
size_t vol = 1;
|
||||
size_t bufsize;
|
||||
|
||||
ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
|
||||
|
||||
if(sign < 0) p->transform = &ffts_execute_nd_real;
|
||||
else p->transform = &ffts_execute_nd_real_inv;
|
||||
|
||||
p->destroy = &ffts_free_nd_real;
|
||||
|
||||
p->rank = rank;
|
||||
p->Ns = malloc(sizeof(size_t) * rank);
|
||||
p->Ms = malloc(sizeof(size_t) * rank);
|
||||
p->plans = malloc(sizeof(ffts_plan_t **) * rank);
|
||||
int i;
|
||||
for(i=0;i<rank;i++) {
|
||||
p->Ns[i] = Ns[i];
|
||||
vol *= Ns[i];
|
||||
}
|
||||
|
||||
//There is probably a prettier way of doing this, but it works..
|
||||
if(sign < 0) {
|
||||
bufsize = 2 * vol;
|
||||
}
|
||||
else {
|
||||
bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
|
||||
}
|
||||
|
||||
p->buf = valloc(sizeof(float) * bufsize);
|
||||
|
||||
for(i=0;i<rank;i++) {
|
||||
p->Ms[i] = vol / p->Ns[i];
|
||||
|
||||
p->plans[i] = NULL;
|
||||
int k;
|
||||
|
||||
if(sign < 0) {
|
||||
for(k=1;k<i;k++) {
|
||||
if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
|
||||
}
|
||||
if(!i) p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
|
||||
else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign);
|
||||
}else{
|
||||
for(k=0;k<i;k++) {
|
||||
if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
|
||||
}
|
||||
if(i==rank-1) p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
|
||||
else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign);
|
||||
}
|
||||
}
|
||||
if(sign < 0) {
|
||||
for(i=1;i<rank;i++) {
|
||||
p->Ns[i] = p->Ns[i] / 2 + 1;
|
||||
}
|
||||
}else{
|
||||
for(i=0;i<rank-1;i++) {
|
||||
p->Ms[i] = p->Ms[i] / 2 + 1;
|
||||
}
|
||||
}
|
||||
|
||||
p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Convenience wrapper: build a rank-2 real plan with dimension lengths
 * {N1, N2} by delegating to ffts_init_nd_real().
 */
ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
	size_t dims[2];

	dims[1] = N2;
	dims[0] = N1;

	return ffts_init_nd_real(2, dims, sign);
}
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
54
3rdparty/ffts/ffts-master/src/ffts_real_nd.h
vendored
Normal file
54
3rdparty/ffts/ffts-master/src/ffts_real_nd.h
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __FFTS_REAL_ND_H__
|
||||
#define __FFTS_REAL_ND_H__
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ffts_nd.h"
|
||||
#include "ffts_real.h"
|
||||
#include "ffts.h"
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#ifdef HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
157
3rdparty/ffts/ffts-master/src/ffts_small.c
vendored
Normal file
157
3rdparty/ffts/ffts-master/src/ffts_small.c
vendored
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
|
||||
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "ffts.h"
|
||||
#include "macros.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define DEBUG(x)
|
||||
|
||||
#include "ffts_small.h"
|
||||
|
||||
/*
 * Size-16 first pass, variant that passes 0 as the direction argument
 * of the load/butterfly helpers (the _b variant passes 1).
 *
 * Loads from `in` at float offsets 0/16/8/24 and 4/20/28/12, applies
 * three K_N steps with coefficients read from p->ws at float offsets
 * 0..20, and stores to `out` at float offsets 0/8/16/24 and 4/12/20/28.
 * The statement order matters: the helpers update the vector
 * temporaries in place through the pointers they are given.
 */
void firstpass_16_f(ffts_plan_t * p, const void * in, void * out)
{
	const data_t *src = (const data_t *)in;
	data_t *dst = (data_t *)out;
	float *twiddles = p->ws;
	V v01, v23, v45, v67, v89, vAB, vCD, vEF;

	L_4_4(0, src+0, src+16, src+8, src+24, &v01, &v23, &v89, &vAB);
	L_2_4(0, src+4, src+20, src+28, src+12, &v45, &v67, &vEF, &vCD);

	K_N(0, VLD(twiddles), VLD(twiddles+4), &v01, &v23, &v45, &v67);

	K_N(0, VLD(twiddles+8), VLD(twiddles+12), &v01, &v45, &v89, &vCD);
	S_4(v01, v45, v89, vCD, dst+0, dst+8, dst+16, dst+24);

	K_N(0, VLD(twiddles+16), VLD(twiddles+20), &v23, &v67, &vAB, &vEF);
	S_4(v23, v67, vAB, vEF, dst+4, dst+12, dst+20, dst+28);
}
|
||||
|
||||
/*
 * Size-16 first pass, variant that passes 1 as the direction argument
 * of the load/butterfly helpers (the _f variant passes 0).
 *
 * Loads from `in` at float offsets 0/16/8/24 and 4/20/28/12, applies
 * three K_N steps with coefficients read from p->ws at float offsets
 * 0..20, and stores to `out` at float offsets 0/8/16/24 and 4/12/20/28.
 * The statement order matters: the helpers update the vector
 * temporaries in place through the pointers they are given.
 */
void firstpass_16_b(ffts_plan_t * p, const void * in, void * out)
{
	const data_t *src = (const data_t *)in;
	data_t *dst = (data_t *)out;
	float *twiddles = p->ws;
	V v01, v23, v45, v67, v89, vAB, vCD, vEF;

	L_4_4(1, src+0, src+16, src+8, src+24, &v01, &v23, &v89, &vAB);
	L_2_4(1, src+4, src+20, src+28, src+12, &v45, &v67, &vEF, &vCD);

	K_N(1, VLD(twiddles), VLD(twiddles+4), &v01, &v23, &v45, &v67);

	K_N(1, VLD(twiddles+8), VLD(twiddles+12), &v01, &v45, &v89, &vCD);
	S_4(v01, v45, v89, vCD, dst+0, dst+8, dst+16, dst+24);

	K_N(1, VLD(twiddles+16), VLD(twiddles+20), &v23, &v67, &vAB, &vEF);
	S_4(v23, v67, vAB, vEF, dst+4, dst+12, dst+20, dst+28);
}
|
||||
|
||||
|
||||
/*
 * Size-8 first pass, variant that passes 0 as the direction argument
 * of the load/butterfly helpers (the _b variant passes 1).
 *
 * Loads from `in` at float offsets 0/8/4/12, applies one K_N step with
 * coefficients read from p->ws + p->ws_is[0], and stores to `out` at
 * float offsets 0/4/8/12.
 */
void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *src = (const data_t *)in;
	data_t *dst = (data_t *)out;
	float *twiddles = p->ws + p->ws_is[0];
	V v01, v23, v45, v67;

	L_4_2(0, src, src+8, src+4, src+12, &v01, &v23, &v45, &v67);
	K_N(0, VLD(twiddles), VLD(twiddles+4), &v01, &v23, &v45, &v67);
	S_4(v01, v23, v45, v67, dst+0, dst+4, dst+8, dst+12);
}
|
||||
|
||||
/*
 * Size-8 first pass, variant that passes 1 as the direction argument
 * of the load/butterfly helpers (the _f variant passes 0).
 *
 * Loads from `in` at float offsets 0/8/4/12, applies one K_N step with
 * coefficients read from p->ws + p->ws_is[0], and stores to `out` at
 * float offsets 0/4/8/12.
 */
void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *src = (const data_t *)in;
	data_t *dst = (data_t *)out;
	float *twiddles = p->ws + p->ws_is[0];
	V v01, v23, v45, v67;

	L_4_2(1, src, src+8, src+4, src+12, &v01, &v23, &v45, &v67);
	K_N(1, VLD(twiddles), VLD(twiddles+4), &v01, &v23, &v45, &v67);
	S_4(v01, v23, v45, v67, dst+0, dst+4, dst+8, dst+12);
}
|
||||
|
||||
|
||||
/*
 * Scalar 4-point pass over interleaved (re, im) pairs; `p` is unused.
 *
 * With a, b, c, d read from input pairs 0, 2, 1, 3:
 *   out pair 0 = (a + b) + (c + d)
 *   out pair 2 = (a + b) - (c + d)
 *   out pair 1 = (a - b) + (im(c - d), -re(c - d))
 *   out pair 3 = (a - b) - (im(c - d), -re(c - d))
 * All inputs are loaded before the first store.
 */
void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *din = (const data_t *)in;
	data_t *dout = (data_t *)out;
	cdata_t a, b, c, d;
	cdata_t sum_ab, diff_ab, sum_cd, diff_cd;

	a[0] = din[0]; a[1] = din[1];
	c[0] = din[2]; c[1] = din[3];
	b[0] = din[4]; b[1] = din[5];
	d[0] = din[6]; d[1] = din[7];

	sum_ab[0]  = a[0] + b[0];  sum_ab[1]  = a[1] + b[1];
	diff_ab[0] = a[0] - b[0];  diff_ab[1] = a[1] - b[1];
	sum_cd[0]  = c[0] + d[0];  sum_cd[1]  = c[1] + d[1];
	diff_cd[0] = c[0] - d[0];  diff_cd[1] = c[1] - d[1];

	dout[0] = sum_ab[0] + sum_cd[0];
	dout[1] = sum_ab[1] + sum_cd[1];
	dout[2] = diff_ab[0] + diff_cd[1];
	dout[3] = diff_ab[1] - diff_cd[0];
	dout[4] = sum_ab[0] - sum_cd[0];
	dout[5] = sum_ab[1] - sum_cd[1];
	dout[6] = diff_ab[0] - diff_cd[1];
	dout[7] = diff_ab[1] + diff_cd[0];
}
|
||||
|
||||
/*
 * Scalar 4-point pass over interleaved (re, im) pairs; `p` is unused.
 * Same as firstpass_4_f except that the cross term has the opposite sign.
 *
 * With a, b, c, d read from input pairs 0, 2, 1, 3:
 *   out pair 0 = (a + b) + (c + d)
 *   out pair 2 = (a + b) - (c + d)
 *   out pair 1 = (a - b) + (-im(c - d), re(c - d))
 *   out pair 3 = (a - b) - (-im(c - d), re(c - d))
 * All inputs are loaded before the first store.
 */
void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *din = (const data_t *)in;
	data_t *dout = (data_t *)out;
	cdata_t a, b, c, d;
	cdata_t sum_ab, diff_ab, sum_cd, diff_cd;

	a[0] = din[0]; a[1] = din[1];
	c[0] = din[2]; c[1] = din[3];
	b[0] = din[4]; b[1] = din[5];
	d[0] = din[6]; d[1] = din[7];

	sum_ab[0]  = a[0] + b[0];  sum_ab[1]  = a[1] + b[1];
	diff_ab[0] = a[0] - b[0];  diff_ab[1] = a[1] - b[1];
	sum_cd[0]  = c[0] + d[0];  sum_cd[1]  = c[1] + d[1];
	diff_cd[0] = c[0] - d[0];  diff_cd[1] = c[1] - d[1];

	dout[0] = sum_ab[0] + sum_cd[0];
	dout[1] = sum_ab[1] + sum_cd[1];
	dout[2] = diff_ab[0] - diff_cd[1];
	dout[3] = diff_ab[1] + diff_cd[0];
	dout[4] = sum_ab[0] - sum_cd[0];
	dout[5] = sum_ab[1] - sum_cd[1];
	dout[6] = diff_ab[0] + diff_cd[1];
	dout[7] = diff_ab[1] - diff_cd[0];
}
|
||||
|
||||
/*
 * Scalar 2-point pass over interleaved (re, im) pairs; `p` is unused.
 * Output pair 0 is the sum of the two input pairs, output pair 1 their
 * difference.  Both inputs are loaded before the first store.
 */
void firstpass_2(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *din = (const data_t *)in;
	data_t *dout = (data_t *)out;
	cdata_t a, b, sum, diff;

	a[0] = din[0];
	a[1] = din[1];
	b[0] = din[2];
	b[1] = din[3];

	sum[0]  = a[0] + b[0];
	sum[1]  = a[1] + b[1];
	diff[0] = a[0] - b[0];
	diff[1] = a[1] - b[1];

	dout[0] = sum[0];
	dout[1] = sum[1];
	dout[2] = diff[0];
	dout[3] = diff[1];
}
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
14
3rdparty/ffts/ffts-master/src/ffts_small.h
vendored
Normal file
14
3rdparty/ffts/ffts-master/src/ffts_small.h
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef __FFTS_SMALL_H__
|
||||
#define __FFTS_SMALL_H__
|
||||
|
||||
|
||||
/*
 * Fixed-size first-pass routines.  The number in the name is the size
 * handled (16, 8, 4 or 2).  The _f variants pass 0 and the _b variants
 * pass 1 as the direction argument of the underlying helpers
 * (NOTE(review): presumably forward / backward -- confirm).  All share
 * the (plan, input, output) signature; the size-4 and size-2 routines
 * do not read the plan.
 */
void firstpass_16_f(ffts_plan_t * p, const void * in, void * out);
|
||||
void firstpass_16_b(ffts_plan_t * p, const void * in, void * out);
|
||||
void firstpass_8_f(ffts_plan_t * p, const void * in, void * out);
|
||||
void firstpass_8_b(ffts_plan_t * p, const void * in, void * out);
|
||||
void firstpass_4_f(ffts_plan_t * p, const void * in, void * out);
|
||||
void firstpass_4_b(ffts_plan_t * p, const void * in, void * out);
|
||||
void firstpass_2(ffts_plan_t * p, const void * in, void * out);
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
102
3rdparty/ffts/ffts-master/src/ffts_static.c
vendored
Normal file
102
3rdparty/ffts/ffts-master/src/ffts_static.c
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#include "ffts_static.h"
|
||||
|
||||
/*
 * Recursive pass of the `_i` static transform over `data` for a block of
 * size N (NOTE(review): `_i` presumably means the inverse direction --
 * confirm against the neon_static_*_i kernels).
 *
 *   N <  16 : nothing is done.
 *   N == 16 : a single neon_static_x4_i pass using the base of p->ws.
 *   N >  16 : five sub-blocks are processed recursively, then one x8 pass
 *             is applied to the whole block.  The top-level block
 *             (N == p->N) uses the `_t` flavour of the x8 kernel.
 */
void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) {
	if (N < 16) {
		return;
	}

	if (N == 16) {
		neon_static_x4_i(data, N, p->ws);
		return;
	}

	const size_t half    = N >> 1;
	const size_t quarter = N >> 2;
	const size_t eighth  = N >> 3;

	/* Per-level offset into the twiddle table: the entry is selected by
	 * the number of trailing zero bits of N minus 4, and doubled. */
	float *twiddles = (float *)(p->ws) + (p->ws_is[__builtin_ctzl(N) - 4] << 1);

	ffts_static_rec_i(p, data,                  quarter);
	ffts_static_rec_i(p, data + half,           eighth);
	ffts_static_rec_i(p, data + half + quarter, eighth);
	ffts_static_rec_i(p, data + N,              quarter);
	ffts_static_rec_i(p, data + N + half,       quarter);

	if (N != p->N) {
		neon_static_x8_i(data, N, twiddles);
	} else {
		neon_static_x8_t_i(data, N, twiddles);
	}
}
|
||||
/*
 * Recursive pass of the `_f` static transform over `data` for a block of
 * size N (NOTE(review): `_f` presumably means the forward direction --
 * confirm against the neon_static_*_f kernels).
 *
 *   N <  16 : nothing is done.
 *   N == 16 : a single neon_static_x4_f pass using the base of p->ws.
 *   N >  16 : five sub-blocks are processed recursively, then one x8 pass
 *             is applied to the whole block.  The top-level block
 *             (N == p->N) uses the `_t` flavour of the x8 kernel.
 */
void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) {
	if (N < 16) {
		return;
	}

	if (N == 16) {
		neon_static_x4_f(data, N, p->ws);
		return;
	}

	const size_t half    = N >> 1;
	const size_t quarter = N >> 2;
	const size_t eighth  = N >> 3;

	/* Per-level offset into the twiddle table: the entry is selected by
	 * the number of trailing zero bits of N minus 4, and doubled. */
	float *twiddles = (float *)(p->ws) + (p->ws_is[__builtin_ctzl(N) - 4] << 1);

	ffts_static_rec_f(p, data,                  quarter);
	ffts_static_rec_f(p, data + half,           eighth);
	ffts_static_rec_f(p, data + half + quarter, eighth);
	ffts_static_rec_f(p, data + N,              quarter);
	ffts_static_rec_f(p, data + N + half,       quarter);

	if (N != p->N) {
		neon_static_x8_f(data, N, twiddles);
	} else {
		neon_static_x8_t_f(data, N, twiddles);
	}
}
|
||||
|
||||
/*
 * Entry point of the `_f` static transform: runs the first pass from `in`
 * into `out`, then the recursive passes in place on `out`.
 *
 * The first-pass kernel is chosen by the parity of the number of trailing
 * zero bits of p->N: odd selects neon_static_o_f, even neon_static_e_f.
 */
void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out) {
	void (*first_pass)(ffts_plan_t *, const void *, void *) =
		(__builtin_ctzl(p->N) & 1) ? neon_static_o_f : neon_static_e_f;

	first_pass(p, in, out);
	ffts_static_rec_f(p, out, p->N);
}
|
||||
|
||||
|
||||
/*
 * Entry point of the `_i` static transform: runs the first pass from `in`
 * into `out`, then the recursive passes in place on `out`.
 *
 * The first-pass kernel is chosen by the parity of the number of trailing
 * zero bits of p->N: odd selects neon_static_o_i, even neon_static_e_i.
 */
void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out) {
	void (*first_pass)(ffts_plan_t *, const void *, void *) =
		(__builtin_ctzl(p->N) & 1) ? neon_static_o_i : neon_static_e_i;

	first_pass(p, in, out);
	ffts_static_rec_i(p, out, p->N);
}
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
47
3rdparty/ffts/ffts-master/src/ffts_static.h
vendored
Normal file
47
3rdparty/ffts/ffts-master/src/ffts_static.h
vendored
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __FFTS_STATIC_H__
|
||||
#define __FFTS_STATIC_H__
|
||||
|
||||
#include "ffts.h"
|
||||
#include "neon.h"
|
||||
|
||||
void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ;
|
||||
void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out);
|
||||
|
||||
void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ;
|
||||
void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out);
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
207
3rdparty/ffts/ffts-master/src/macros-alpha.h
vendored
Normal file
207
3rdparty/ffts/ffts-master/src/macros-alpha.h
vendored
Normal file
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
|
||||
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __MACROS_ALPHA_H__
|
||||
#define __MACROS_ALPHA_H__
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef __alpha__
|
||||
#define restrict
|
||||
#endif
|
||||
|
||||
typedef struct {float r1, i1, r2, i2;} V;
|
||||
|
||||
#define FFTS_MALLOC(d,a) malloc(d)
|
||||
#define FFTS_FREE(d) free(d)
|
||||
|
||||
#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3})
|
||||
|
||||
/* Lane-wise sum: each of r1, i1, r2, i2 is added independently. */
static inline V VADD(V x, V y)
{
	V sum = {
		.r1 = x.r1 + y.r1,
		.i1 = x.i1 + y.i1,
		.r2 = x.r2 + y.r2,
		.i2 = x.i2 + y.i2
	};
	return sum;
}
|
||||
|
||||
|
||||
/* Lane-wise difference x - y over r1, i1, r2, i2. */
static inline V VSUB(V x, V y)
{
	V diff = {
		.r1 = x.r1 - y.r1,
		.i1 = x.i1 - y.i1,
		.r2 = x.r2 - y.r2,
		.i2 = x.i2 - y.i2
	};
	return diff;
}
|
||||
|
||||
|
||||
/* Lane-wise product (not a complex multiply): each field is multiplied
 * with the same-named field of the other operand. */
static inline V VMUL(V x, V y)
{
	V prod = {
		.r1 = x.r1 * y.r1,
		.i1 = x.i1 * y.i1,
		.r2 = x.r2 * y.r2,
		.i2 = x.i2 * y.i2
	};
	return prod;
}
|
||||
|
||||
/*
 * Lane-wise bitwise XOR of the IEEE bit patterns of x and y.
 *
 * The previous implementation cast the float *values* to uint32_t, which
 * performs a numeric conversion rather than reinterpreting the bits: XOR
 * with -0.0f (the sign-flip idiom used with VXOR elsewhere in FFTS) became
 * a no-op and negative inputs invoked undefined behaviour.  The bits are
 * now reinterpreted through a union so the result matches the vector
 * back-ends, which XOR the raw register contents.
 */
static inline V VXOR(V x, V y)
{
	union { float f; uint32_t u; } a, b;
	V r;

	a.f = x.r1; b.f = y.r1; a.u ^= b.u; r.r1 = a.f;
	a.f = x.i1; b.f = y.i1; a.u ^= b.u; r.i1 = a.f;
	a.f = x.r2; b.f = y.r2; a.u ^= b.u; r.r2 = a.f;
	a.f = x.i2; b.f = y.i2; a.u ^= b.u; r.i2 = a.f;

	return r;
}
|
||||
|
||||
/* Exchanges the two fields of each pair: (r1,i1,r2,i2) -> (i1,r1,i2,r2). */
static inline V VSWAPPAIRS(V x)
{
	V swapped = {
		.r1 = x.i1,
		.i1 = x.r1,
		.r2 = x.i2,
		.i2 = x.r2
	};
	return swapped;
}
|
||||
|
||||
|
||||
/* Takes the first pair (r1,i1) from x and the second pair (r2,i2) from y. */
static inline V VBLEND(V x, V y)
{
	V mixed = {
		.r1 = x.r1,
		.i1 = x.i1,
		.r2 = y.r2,
		.i2 = y.i2
	};
	return mixed;
}
|
||||
|
||||
/* Packs the second pair of x followed by the second pair of y:
 * (x.r2, x.i2, y.r2, y.i2). */
static inline V VUNPACKHI(V x, V y)
{
	V hi = {
		.r1 = x.r2,
		.i1 = x.i2,
		.r2 = y.r2,
		.i2 = y.i2
	};
	return hi;
}
|
||||
|
||||
/* Packs the first pair of x followed by the first pair of y:
 * (x.r1, x.i1, y.r1, y.i1). */
static inline V VUNPACKLO(V x, V y)
{
	V lo = {
		.r1 = x.r1,
		.i1 = x.i1,
		.r2 = y.r1,
		.i2 = y.i1
	};
	return lo;
}
|
||||
|
||||
/* Duplicates the `r` field of each pair: (r1, r1, r2, r2). */
static inline V VDUPRE(V x)
{
	V dup = {
		.r1 = x.r1,
		.i1 = x.r1,
		.r2 = x.r2,
		.i2 = x.r2
	};
	return dup;
}
|
||||
|
||||
/* Duplicates the `i` field of each pair: (i1, i1, i2, i2). */
static inline V VDUPIM(V x)
{
	V dup = {
		.r1 = x.i1,
		.i1 = x.i1,
		.r2 = x.i2,
		.i2 = x.i2
	};
	return dup;
}
|
||||
|
||||
/* Returns re*d - im*swap(d), all operations lane-wise, where swap()
 * exchanges the two fields of each pair of d. */
static inline V IMUL(V d, V re, V im)
{
	const V direct = VMUL(re, d);
	const V cross  = VMUL(im, VSWAPPAIRS(d));

	return VSUB(direct, cross);
}
|
||||
|
||||
|
||||
/* Returns re*d + im*swap(d), all operations lane-wise, where swap()
 * exchanges the two fields of each pair of d. */
static inline V IMULJ(V d, V re, V im)
{
	const V direct = VMUL(re, d);
	const V cross  = VMUL(im, VSWAPPAIRS(d));

	return VADD(direct, cross);
}
|
||||
|
||||
/*
 * Negates one field of each pair and leaves the other untouched:
 *   inv != 0 : the `r` fields are negated  -> (-r1,  i1, -r2,  i2)
 *   inv == 0 : the `i` fields are negated  -> ( r1, -i1,  r2, -i2)
 */
static inline V MULI(int inv, V x)
{
	V z;

	z.r1 = inv ? -x.r1 :  x.r1;
	z.i1 = inv ?  x.i1 : -x.i1;
	z.r2 = inv ? -x.r2 :  x.r2;
	z.i2 = inv ?  x.i2 : -x.i2;

	return z;
}
|
||||
|
||||
|
||||
/* Applies MULI(inv, x) and then swaps the two fields of each pair.
 * Reading a pair as (re, im), that is multiplication by -i when inv is
 * non-zero and by +i otherwise. */
static inline V IMULI(int inv, V x)
{
	const V sign_flipped = MULI(inv, x);

	return VSWAPPAIRS(sign_flipped);
}
|
||||
|
||||
|
||||
static inline V VLD(const void *s)
|
||||
{
|
||||
V *d = (V *)s;
|
||||
return *d;
|
||||
}
|
||||
|
||||
|
||||
/* Stores `s` to the memory at `d` by writing through it as a V. */
static inline void VST(void *d, V s)
{
	*(V *)d = s;
}
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
138
3rdparty/ffts/ffts-master/src/macros-altivec.h
vendored
Normal file
138
3rdparty/ffts/ffts-master/src/macros-altivec.h
vendored
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
|
||||
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __MACROS_ALTIVEC_H__
|
||||
#define __MACROS_ALTIVEC_H__
|
||||
|
||||
#include <math.h>
|
||||
#include <altivec.h>
|
||||
|
||||
#define restrict
|
||||
|
||||
typedef vector float V;
|
||||
typedef vector unsigned char VUC;
|
||||
|
||||
/*
 * Allocation helpers.  The second argument of FFTS_MALLOC is accepted but
 * not used by either variant; the non-Apple variant always requests
 * 16-byte alignment.
 *
 * The platform test uses __APPLE__, the macro compilers actually predefine
 * on Apple targets.  The previous lower-case spelling `__apple__` is never
 * defined, so the vec_malloc() branch could not be selected.
 */
#ifdef __APPLE__
#define FFTS_MALLOC(d,a) vec_malloc(d)
#define FFTS_FREE(d) vec_free(d)
#else
/* It appears vec_malloc() and friends are not implemented on Linux */
#include <malloc.h>
#define FFTS_MALLOC(d,a) memalign(16,d)
#define FFTS_FREE(d) free(d)
#endif
|
||||
|
||||
#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
|
||||
|
||||
#define VADD(x,y) vec_add(x,y)
|
||||
#define VSUB(x,y) vec_sub(x,y)
|
||||
#define VMUL(x,y) vec_madd(x,y,(V){0})
|
||||
#define VMULADD(x,y,z) vec_madd(x,y,z)
|
||||
#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
|
||||
#define VXOR(x,y) vec_xor((x),(y))
|
||||
#define VSWAPPAIRS(x) \
|
||||
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
|
||||
0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
|
||||
|
||||
#define VBLEND(x,y) \
|
||||
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
|
||||
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
|
||||
|
||||
#define VUNPACKHI(x,y) \
|
||||
vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
|
||||
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
|
||||
|
||||
#define VUNPACKLO(x,y) \
|
||||
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
|
||||
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
|
||||
|
||||
#define VDUPRE(x) \
|
||||
vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
|
||||
0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
|
||||
|
||||
#define VDUPIM(x) \
|
||||
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
|
||||
0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
|
||||
|
||||
|
||||
/* Returns re*d - im*swap(d), lane-wise, where swap() is VSWAPPAIRS.
 * The cross term is formed first, as in the original sequence. */
static inline V IMUL(V d, V re, V im)
{
	const V cross  = VMUL(im, VSWAPPAIRS(d));
	const V direct = VMUL(re, d);

	return VSUB(direct, cross);
}
|
||||
|
||||
|
||||
/* Returns re*d + im*swap(d), lane-wise; the final multiply and add are
 * issued as a single VMULADD. */
static inline V IMULJ(V d, V re, V im)
{
	const V cross = VMUL(im, VSWAPPAIRS(d));

	return VMULADD(re, d, cross);
}
|
||||
|
||||
#ifndef __GNUC__
|
||||
/* gcc (4.6 and 4.7) ICEs on this code! */
|
||||
static inline V MULI(int inv, V x)
|
||||
{
|
||||
return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
|
||||
}
|
||||
#else
|
||||
/* but compiles this fine... */
|
||||
static inline V MULI(int inv, V x)
|
||||
{
|
||||
V t;
|
||||
t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
|
||||
return VXOR(x, t);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Applies MULI(inv, x) and then swaps the two lanes of each pair. */
static inline V IMULI(int inv, V x)
{
	const V sign_flipped = MULI(inv, x);

	return VSWAPPAIRS(sign_flipped);
}
|
||||
|
||||
|
||||
static inline V VLD(const void *s)
|
||||
{
|
||||
V *d = (V *)s;
|
||||
return *d;
|
||||
}
|
||||
|
||||
|
||||
/* Stores the vector `s` to the memory at `d` by writing through it as a V. */
static inline void VST(void *d, V s)
{
	*(V *)d = s;
}
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
97
3rdparty/ffts/ffts-master/src/macros-neon.h
vendored
Normal file
97
3rdparty/ffts/ffts-master/src/macros-neon.h
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
#ifndef __MACROS_NEON_H__
|
||||
#define __MACROS_NEON_H__
|
||||
|
||||
#include "neon.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
typedef float32x4_t V;
|
||||
|
||||
typedef float32x4x2_t VS;
|
||||
|
||||
#define ADD vaddq_f32
|
||||
#define SUB vsubq_f32
|
||||
#define MUL vmulq_f32
|
||||
#define VADD vaddq_f32
|
||||
#define VSUB vsubq_f32
|
||||
#define VMUL vmulq_f32
|
||||
#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
|
||||
#define VST vst1q_f32
|
||||
#define VLD vld1q_f32
|
||||
#define VST2 vst2q_f32
|
||||
#define VLD2 vld2q_f32
|
||||
|
||||
#define VSWAPPAIRS(x) (vrev64q_f32(x))
|
||||
|
||||
#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
|
||||
#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
|
||||
|
||||
#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
|
||||
|
||||
/* Builds a vector from four scalars.  Arguments are given highest lane
 * first: f0 lands in lane 0 and f3 in lane 3.  The values are staged in a
 * 16-byte-aligned array and loaded with vld1q_f32. */
__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
	data_t __attribute__ ((aligned(16))) lanes[4] = {f0, f1, f2, f3};

	return vld1q_f32(lanes);
}
|
||||
|
||||
#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
|
||||
#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
|
||||
|
||||
#define FFTS_MALLOC(d,a) (valloc(d))
|
||||
#define FFTS_FREE(d) (free(d))
|
||||
|
||||
/* Writes the two vectors of `p` back to back: p.val[0] to addr[0..3] and
 * p.val[1] to addr[4..7]. */
__INLINE void STORESPR(data_t * addr, VS p) {
	VST(addr, p.val[0]);
	VST(addr + 4, p.val[1]);
}
|
||||
|
||||
/* Flips the sign bit of lanes 0 and 2 when inv is non-zero, or of lanes 1
 * and 3 otherwise, then swaps the two lanes of each pair.  Reading a pair
 * as (re, im), that is multiplication by -i or by +i respectively. */
__INLINE V IMULI(int inv, V a) {
	V sign_mask;

	if (inv) {
		sign_mask = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
	} else {
		sign_mask = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
	}

	return VSWAPPAIRS(VXOR(a, sign_mask));
}
|
||||
|
||||
/* Returns re*d - im*swap(d), lane-wise, where swap() is VSWAPPAIRS. */
__INLINE V IMUL(V d, V re, V im) {
	const V direct = VMUL(re, d);
	const V cross  = VMUL(im, VSWAPPAIRS(d));

	return VSUB(direct, cross);
}
|
||||
|
||||
/* Returns re*d + im*swap(d), lane-wise, where swap() is VSWAPPAIRS. */
__INLINE V IMULJ(V d, V re, V im) {
	const V direct = VMUL(re, d);
	const V cross  = VMUL(im, VSWAPPAIRS(d));

	return VADD(direct, cross);
}
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
85
3rdparty/ffts/ffts-master/src/macros-sse.h
vendored
Normal file
85
3rdparty/ffts/ffts-master/src/macros-sse.h
vendored
Normal file
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __SSE_FLOAT_H__
|
||||
#define __SSE_FLOAT_H__
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
//#define VL 4
|
||||
|
||||
typedef __m128 V;
|
||||
|
||||
#define VADD _mm_add_ps
|
||||
#define VSUB _mm_sub_ps
|
||||
#define VMUL _mm_mul_ps
|
||||
//#define VLIT4 _mm_set_ps
|
||||
#define VXOR _mm_xor_ps
|
||||
#define VST _mm_store_ps
|
||||
#define VLD _mm_load_ps
|
||||
|
||||
#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
|
||||
|
||||
#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
|
||||
#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
|
||||
|
||||
#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
|
||||
|
||||
#define VLIT4 _mm_set_ps
|
||||
|
||||
#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
|
||||
#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
|
||||
|
||||
#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
|
||||
#define FFTS_FREE(d) (_mm_free(d))
|
||||
|
||||
/* Flips the sign bit of lanes 0 and 2 when inv is non-zero, or of lanes 1
 * and 3 otherwise (VLIT4 is _mm_set_ps, so its last argument is lane 0),
 * then swaps the two lanes of each pair.  Reading a pair as (re, im), that
 * is multiplication by -i or by +i respectively. */
__INLINE V IMULI(int inv, V a) {
	V sign_mask;

	if (inv) {
		sign_mask = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
	} else {
		sign_mask = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
	}

	return VSWAPPAIRS(VXOR(a, sign_mask));
}
|
||||
|
||||
|
||||
/* Returns re*d - im*swap(d), lane-wise, where swap() is VSWAPPAIRS. */
__INLINE V IMUL(V d, V re, V im) {
	const V direct = VMUL(re, d);
	const V cross  = VMUL(im, VSWAPPAIRS(d));

	return VSUB(direct, cross);
}
|
||||
|
||||
/* Returns re*d + im*swap(d), lane-wise, where swap() is VSWAPPAIRS. */
__INLINE V IMULJ(V d, V re, V im) {
	const V direct = VMUL(re, d);
	const V cross  = VMUL(im, VSWAPPAIRS(d));

	return VADD(direct, cross);
}
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
162
3rdparty/ffts/ffts-master/src/macros.h
vendored
Normal file
162
3rdparty/ffts/ffts-master/src/macros.h
vendored
Normal file
@@ -0,0 +1,162 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
|
||||
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __MACROS_H__
|
||||
#define __MACROS_H__
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
#include "macros-neon.h"
|
||||
#else
|
||||
#ifdef __alpha__
|
||||
#include "macros-alpha.h"
|
||||
#else
|
||||
#ifdef __powerpc__
|
||||
#include "macros-altivec.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef HAVE_VFP
|
||||
#include "macros-alpha.h"
|
||||
#endif
|
||||
#ifdef HAVE_SSE
|
||||
#include "macros-sse.h"
|
||||
#endif
|
||||
|
||||
/* In-place 2x2 transpose of pairs: afterwards *a holds the low pairs of
 * the original (*a, *b) and *b holds their high pairs.  Both results are
 * computed before either operand is overwritten. */
static inline void TX2(V *a, V *b)
{
	const V low_pairs  = VUNPACKLO(*a, *b);
	const V high_pairs = VUNPACKHI(*a, *b);

	*a = low_pairs;
	*b = high_pairs;
}
|
||||
|
||||
/*
 * In-place butterfly over the four vectors r0..r3.
 *
 * *r2 and *r3 are first combined with (re, im) through IMUL and IMULJ.
 * The sum of those two products is added to / subtracted from *r0, and
 * their IMULI-rotated difference is added to / subtracted from *r1.
 * `inv` is only forwarded to IMULI.
 */
static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
{
	/* Read every input before the first output is written. */
	const V u0    = *r0;
	const V u1    = *r1;
	const V z_pos = IMUL(*r2, re, im);
	const V z_neg = IMULJ(*r3, re, im);

	const V z_sum = VADD(z_pos, z_neg);
	const V z_rot = IMULI(inv, VSUB(z_pos, z_neg));

	*r2 = VSUB(u0, z_sum);
	*r0 = VADD(u0, z_sum);
	*r3 = VADD(u1, z_rot);
	*r1 = VSUB(u1, z_rot);
}
|
||||
|
||||
|
||||
/* Stores the four vectors r0..r3 to o0..o3 respectively, in that order. */
static inline void S_4(V r0, V r1, V r2, V r3,
		data_t * restrict o0, data_t * restrict o1,
		data_t * restrict o2, data_t * restrict o3)
{
	VST(o0, r0);
	VST(o1, r1);
	VST(o2, r2);
	VST(o3, r3);
}
|
||||
|
||||
|
||||
/*
 * Loads four vectors from i0..i3 and writes four result vectors to r0..r3.
 *
 * The low pairs of the sum/difference vectors go to *r0 and *r1 as they
 * are.  The remaining stage (with the i0/i1 difference rotated through
 * IMULI) is computed on whole vectors and only its high pairs are kept,
 * in *r3 and *r2.  `inv` is only forwarded to IMULI.
 */
static inline void L_2_4(int inv,
		const data_t * restrict i0, const data_t * restrict i1,
		const data_t * restrict i2, const data_t * restrict i3,
		V *r0, V *r1, V *r2, V *r3)
{
	const V in0 = VLD(i0);
	const V in1 = VLD(i1);
	const V in2 = VLD(i2);
	const V in3 = VLD(i3);

	const V sum01  = VADD(in0, in1);
	const V diff01 = VSUB(in0, in1);
	const V sum23  = VADD(in2, in3);
	const V diff23 = VSUB(in2, in3);

	*r0 = VUNPACKLO(sum01, diff01);
	*r1 = VUNPACKLO(sum23, diff23);

	const V rot01 = IMULI(inv, diff01);

	const V out0 = VADD(sum23, sum01);
	const V out2 = VSUB(sum23, sum01);
	const V out1 = VSUB(diff23, rot01);
	const V out3 = VADD(diff23, rot01);

	*r3 = VUNPACKHI(out0, out1);
	*r2 = VUNPACKHI(out2, out3);
}
|
||||
|
||||
|
||||
/*
 * Loads four vectors from i0..i3, runs two add/subtract stages (the i2/i3
 * difference is rotated through IMULI first) and writes the pair-transposed
 * results to r0..r3.  `inv` is only forwarded to IMULI.
 *
 * The final transpose is written out with VUNPACKLO/VUNPACKHI directly;
 * the stores keep the r0, r2, r1, r3 order.
 */
static inline void L_4_4(int inv,
		const data_t * restrict i0, const data_t * restrict i1,
		const data_t * restrict i2, const data_t * restrict i3,
		V *r0, V *r1, V *r2, V *r3)
{
	const V in0 = VLD(i0);
	const V in1 = VLD(i1);
	const V in2 = VLD(i2);
	const V in3 = VLD(i3);

	const V sum01  = VADD(in0, in1);
	const V diff01 = VSUB(in0, in1);
	const V sum23  = VADD(in2, in3);
	const V rot23  = IMULI(inv, VSUB(in2, in3));

	const V out0 = VADD(sum01, sum23);
	const V out2 = VSUB(sum01, sum23);
	const V out1 = VSUB(diff01, rot23);
	const V out3 = VADD(diff01, rot23);

	*r0 = VUNPACKLO(out0, out1);
	*r2 = VUNPACKHI(out0, out1);
	*r1 = VUNPACKLO(out2, out3);
	*r3 = VUNPACKHI(out2, out3);
}
|
||||
|
||||
|
||||
|
||||
/*
 * Loads four vectors from i0..i3 and writes four result vectors to r0..r3.
 *
 * The vectors from i2 and i3 are first cross-blended, so each working
 * vector takes its low pair from one and its high pair from the other.
 * The high pairs of the sum/difference vectors go to *r2 and *r3 as they
 * are.  The remaining stage (with the blended difference rotated through
 * IMULI) is computed on whole vectors and only its low pairs are kept,
 * in *r0 and *r1.  `inv` is only forwarded to IMULI.
 */
static inline void L_4_2(int inv,
		const data_t * restrict i0, const data_t * restrict i1,
		const data_t * restrict i2, const data_t * restrict i3,
		V *r0, V *r1, V *r2, V *r3)
{
	const V in0  = VLD(i0);
	const V in1  = VLD(i1);
	const V raw2 = VLD(i2);
	const V raw3 = VLD(i3);

	const V in2 = VBLEND(raw2, raw3);
	const V in3 = VBLEND(raw3, raw2);

	const V sum01  = VADD(in0, in1);
	const V diff01 = VSUB(in0, in1);
	const V sum23  = VADD(in2, in3);
	const V diff23 = VSUB(in2, in3);

	*r2 = VUNPACKHI(sum01, diff01);
	*r3 = VUNPACKHI(sum23, diff23);

	const V rot23 = IMULI(inv, diff23);

	const V out0 = VADD(sum01, sum23);
	const V out2 = VSUB(sum01, sum23);
	const V out1 = VSUB(diff01, rot23);
	const V out3 = VADD(diff01, rot23);

	*r0 = VUNPACKLO(out0, out1);
	*r1 = VUNPACKLO(out2, out3);
}
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
66
3rdparty/ffts/ffts-master/src/neon.h
vendored
Normal file
66
3rdparty/ffts/ffts-master/src/neon.h
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __NEON_H__
|
||||
#define __NEON_H__
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
void neon_x4(float *, size_t, float *);
|
||||
void neon_x8(float *, size_t, float *);
|
||||
void neon_x8_t(float *, size_t, float *);
|
||||
void neon_ee();
|
||||
void neon_oo();
|
||||
void neon_eo();
|
||||
void neon_oe();
|
||||
void neon_end();
|
||||
|
||||
void neon_transpose(uint64_t *in, uint64_t *out, int w, int h);
|
||||
void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w);
|
||||
|
||||
//typedef struct _ffts_plan_t ffts_plan_t;
|
||||
|
||||
void neon_static_e_f(ffts_plan_t * , const void * , void * );
|
||||
void neon_static_o_f(ffts_plan_t * , const void * , void * );
|
||||
void neon_static_x4_f(float *, size_t, float *);
|
||||
void neon_static_x8_f(float *, size_t, float *);
|
||||
void neon_static_x8_t_f(float *, size_t, float *);
|
||||
|
||||
void neon_static_e_i(ffts_plan_t * , const void * , void * );
|
||||
void neon_static_o_i(ffts_plan_t * , const void * , void * );
|
||||
void neon_static_x4_i(float *, size_t, float *);
|
||||
void neon_static_x8_i(float *, size_t, float *);
|
||||
void neon_static_x8_t_i(float *, size_t, float *);
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
738
3rdparty/ffts/ffts-master/src/neon.s
vendored
Normal file
738
3rdparty/ffts/ffts-master/src/neon.s
vendored
Normal file
@@ -0,0 +1,738 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
 * neon_x4 -- straight-line NEON routine that updates four 32-byte groups of
 * floats in place and returns with "bx lr".
 *
 * Registers as used by the code below:
 *   r0 = base address; the groups live at r0, r0 + 2*r1, r0 + 4*r1 and
 *        r0 + 6*r1 (kept in r0, r4, r5 and r6 respectively)
 *   r1 = stride term used in that address arithmetic
 *   r2 = address of 32 bytes of coefficients, loaded into q2/q3
 * Every access carries the :128 alignment qualifier.
 *
 * r4-r6 and q0-q15 are overwritten; nothing is saved or restored here.
 * NOTE(review): presumably a radix-4 butterfly pass with r2 pointing at
 * twiddle factors -- confirm against the callers.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_x4
|
||||
_neon_x4:
|
||||
#else
|
||||
.globl neon_x4
|
||||
neon_x4:
|
||||
#endif
|
||||
@ add r3, r0, #0
|
||||
|
||||
/* Load the four data groups and the coefficient pair. */
vld1.32 {q8,q9}, [r0, :128]
|
||||
add r4, r0, r1, lsl #1
|
||||
vld1.32 {q10,q11}, [r4, :128]
|
||||
add r5, r0, r1, lsl #2
|
||||
vld1.32 {q12,q13}, [r5, :128]
|
||||
add r6, r4, r1, lsl #2
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q2,q3}, [r2, :128]
|
||||
|
||||
/* Products of the groups at r5 and r6 with the coefficients in q2/q3. */
vmul.f32 q0, q13, q3
|
||||
vmul.f32 q5, q12, q2
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q4, q14, q3
|
||||
vmul.f32 q14, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vmul.f32 q12, q15, q3
|
||||
vmul.f32 q2, q15, q2
|
||||
vsub.f32 q0, q5, q0
|
||||
vadd.f32 q13, q13, q14
|
||||
vadd.f32 q12, q12, q1
|
||||
vsub.f32 q1, q2, q4
|
||||
vadd.f32 q15, q0, q12
|
||||
vsub.f32 q12, q0, q12
|
||||
vadd.f32 q14, q13, q1
|
||||
vsub.f32 q13, q13, q1
|
||||
/* Combine with the groups at r0 and r4 and write all four groups back. */
vadd.f32 q0, q8, q15
|
||||
vadd.f32 q1, q9, q14
|
||||
vsub.f32 q2, q10, q13 @
|
||||
vsub.f32 q4, q8, q15
|
||||
vadd.f32 q3, q11, q12 @
|
||||
vst1.32 {q0,q1}, [r0, :128]
|
||||
vsub.f32 q5, q9, q14
|
||||
vadd.f32 q6, q10, q13 @
|
||||
vsub.f32 q7, q11, q12 @
|
||||
vst1.32 {q2,q3}, [r4, :128]
|
||||
vst1.32 {q4,q5}, [r5, :128]
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
bx lr
|
||||
|
||||
/*
 * neon_x8 -- looped NEON routine that updates eight data streams in place.
 *
 * Registers as used by the code below:
 *   r0 = base address; stream k (k = 0..7) starts at r0 + k*r1 and is kept
 *        in r3..r10 (data0..data7)
 *   r1 = stride term between streams; the loop runs (r1 >> 5) times
 *   r2 = coefficient table, copied to r12 and read 3 x 32 bytes per iteration
 * Each iteration consumes 32 bytes from every stream and post-increments all
 * eight pointers. The streams at r4 and r6 are written without increment
 * part-way through the iteration and re-read before their final store.
 *
 * r3-r12 and q0-q15 are overwritten; nothing is saved or restored here.
 * NOTE(review): the counter only stops at exactly zero, so r1 >> 5 is
 * presumably always non-zero -- confirm against the callers.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_x8
|
||||
_neon_x8:
|
||||
#else
|
||||
.globl neon_x8
|
||||
neon_x8:
|
||||
#endif
|
||||
mov r11, #0
|
||||
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
||||
/* r11 = -(r1 >> 5); the adds inside the loop counts it up towards zero. */
sub r11, r11, r1, lsr #5
|
||||
neon_x8_loop:
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q10,q11}, [r5, :128]
|
||||
/* Sets the flags tested by the bne at the bottom of the loop. */
adds r11, r11, #1
|
||||
vmul.f32 q12, q15, q2
|
||||
vmul.f32 q8, q14, q3
|
||||
vmul.f32 q13, q14, q2
|
||||
vmul.f32 q9, q10, q3
|
||||
vmul.f32 q1, q10, q2
|
||||
vmul.f32 q0, q11, q2
|
||||
vmul.f32 q14, q11, q3
|
||||
vmul.f32 q15, q15, q3
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q10, q12, q8
|
||||
vadd.f32 q11, q0, q9
|
||||
vadd.f32 q8, q15, q13
|
||||
vld1.32 {q12,q13}, [r4, :128]
|
||||
vsub.f32 q9, q1, q14
|
||||
vsub.f32 q15, q11, q10
|
||||
vsub.f32 q14, q9, q8
|
||||
vsub.f32 q4, q12, q15 @
|
||||
vadd.f32 q6, q12, q15 @
|
||||
vadd.f32 q5, q13, q14 @
|
||||
vsub.f32 q7, q13, q14 @
|
||||
vld1.32 {q14,q15}, [r9, :128]
|
||||
vld1.32 {q12,q13}, [r7, :128]
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q0, q14, q3
|
||||
/* Intermediate result: r4 is not advanced here and is re-read below. */
vst1.32 {q4,q5}, [r4, :128]
|
||||
vmul.f32 q14, q15, q3
|
||||
vmul.f32 q4, q15, q2
|
||||
vadd.f32 q15, q9, q8
|
||||
/* Intermediate result: r6 is not advanced here and is re-read below. */
vst1.32 {q6,q7}, [r6, :128]
|
||||
vmul.f32 q8, q12, q3
|
||||
vmul.f32 q5, q13, q3
|
||||
vmul.f32 q12, q12, q2
|
||||
vmul.f32 q9, q13, q2
|
||||
vadd.f32 q14, q14, q1
|
||||
vsub.f32 q13, q4, q0
|
||||
vadd.f32 q0, q9, q8
|
||||
vld1.32 {q8,q9}, [r3, :128]
|
||||
vadd.f32 q1, q11, q10
|
||||
vsub.f32 q12, q12, q5
|
||||
vadd.f32 q11, q8, q15
|
||||
vsub.f32 q8, q8, q15
|
||||
vadd.f32 q2, q12, q14
|
||||
vsub.f32 q10, q0, q13
|
||||
vadd.f32 q15, q0, q13
|
||||
vadd.f32 q13, q9, q1
|
||||
vsub.f32 q9, q9, q1
|
||||
vsub.f32 q12, q12, q14
|
||||
vadd.f32 q0, q11, q2
|
||||
vadd.f32 q1, q13, q15
|
||||
vsub.f32 q4, q11, q2
|
||||
vsub.f32 q2, q8, q10 @
|
||||
vadd.f32 q3, q9, q12 @
|
||||
vst1.32 {q0,q1}, [r3, :128]!
|
||||
vsub.f32 q5, q13, q15
|
||||
vld1.32 {q14,q15}, [r10, :128]
|
||||
vsub.f32 q7, q9, q12 @
|
||||
vld1.32 {q12,q13}, [r8, :128]
|
||||
vst1.32 {q2,q3}, [r5, :128]!
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vadd.f32 q6, q8, q10 @
|
||||
vmul.f32 q8, q14, q2
|
||||
vst1.32 {q4,q5}, [r7, :128]!
|
||||
vmul.f32 q10, q15, q3
|
||||
vmul.f32 q9, q13, q3
|
||||
vmul.f32 q11, q12, q2
|
||||
vmul.f32 q14, q14, q3
|
||||
vst1.32 {q6,q7}, [r9, :128]!
|
||||
vmul.f32 q15, q15, q2
|
||||
vmul.f32 q12, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vadd.f32 q10, q10, q8
|
||||
vsub.f32 q11, q11, q9
|
||||
vld1.32 {q8,q9}, [r4, :128]
|
||||
vsub.f32 q14, q15, q14
|
||||
vadd.f32 q15, q13, q12
|
||||
vadd.f32 q13, q11, q10
|
||||
vadd.f32 q12, q15, q14
|
||||
vsub.f32 q15, q15, q14
|
||||
vsub.f32 q14, q11, q10
|
||||
vld1.32 {q10,q11}, [r6, :128]
|
||||
vadd.f32 q0, q8, q13
|
||||
vadd.f32 q1, q9, q12
|
||||
vsub.f32 q2, q10, q15 @
|
||||
vadd.f32 q3, q11, q14 @
|
||||
vsub.f32 q4, q8, q13
|
||||
vst1.32 {q0,q1}, [r4, :128]!
|
||||
vsub.f32 q5, q9, q12
|
||||
vadd.f32 q6, q10, q15 @
|
||||
vst1.32 {q2,q3}, [r6, :128]!
|
||||
vsub.f32 q7, q11, q14 @
|
||||
vst1.32 {q4,q5}, [r8, :128]!
|
||||
vst1.32 {q6,q7}, [r10, :128]!
|
||||
/* Tests the flags from the adds above; the NEON ops between leave them alone. */
bne neon_x8_loop
|
||||
|
||||
bx lr
|
||||
|
||||
/*
 * neon_x8_t -- same register setup, loop count and arithmetic as neon_x8,
 * but the eight final stores use vst2.32 (two-way interleaving store) where
 * neon_x8 uses vst1.32.
 *
 * Registers as used by the code below:
 *   r0 = base address; stream k (k = 0..7) starts at r0 + k*r1 (r3..r10)
 *   r1 = stride term between streams; the loop runs (r1 >> 5) times
 *   r2 = coefficient table, copied to r12 and read 3 x 32 bytes per iteration
 *
 * The trailing "bx lr" is commented out, so after the loop execution
 * continues into whatever code follows this routine.
 * NOTE(review): presumably intentional because the code is copied or
 * entered by a generator that supplies its own return -- confirm.
 *
 * r3-r12 and q0-q15 are overwritten; nothing is saved or restored here.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_x8_t
|
||||
_neon_x8_t:
|
||||
#else
|
||||
.globl neon_x8_t
|
||||
neon_x8_t:
|
||||
#endif
|
||||
mov r11, #0
|
||||
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
||||
/* r11 = -(r1 >> 5); the adds inside the loop counts it up towards zero. */
sub r11, r11, r1, lsr #5
|
||||
neon_x8_t_loop:
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q10,q11}, [r5, :128]
|
||||
/* Sets the flags tested by the bne at the bottom of the loop. */
adds r11, r11, #1
|
||||
vmul.f32 q12, q15, q2
|
||||
vmul.f32 q8, q14, q3
|
||||
vmul.f32 q13, q14, q2
|
||||
vmul.f32 q9, q10, q3
|
||||
vmul.f32 q1, q10, q2
|
||||
vmul.f32 q0, q11, q2
|
||||
vmul.f32 q14, q11, q3
|
||||
vmul.f32 q15, q15, q3
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q10, q12, q8
|
||||
vadd.f32 q11, q0, q9
|
||||
vadd.f32 q8, q15, q13
|
||||
vld1.32 {q12,q13}, [r4, :128]
|
||||
vsub.f32 q9, q1, q14
|
||||
vsub.f32 q15, q11, q10
|
||||
vsub.f32 q14, q9, q8
|
||||
vsub.f32 q4, q12, q15 @
|
||||
vadd.f32 q6, q12, q15 @
|
||||
vadd.f32 q5, q13, q14 @
|
||||
vsub.f32 q7, q13, q14 @
|
||||
vld1.32 {q14,q15}, [r9, :128]
|
||||
vld1.32 {q12,q13}, [r7, :128]
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q0, q14, q3
|
||||
/* Intermediate (non-interleaved) result: r4 is re-read below. */
vst1.32 {q4,q5}, [r4, :128]
|
||||
vmul.f32 q14, q15, q3
|
||||
vmul.f32 q4, q15, q2
|
||||
vadd.f32 q15, q9, q8
|
||||
/* Intermediate (non-interleaved) result: r6 is re-read below. */
vst1.32 {q6,q7}, [r6, :128]
|
||||
vmul.f32 q8, q12, q3
|
||||
vmul.f32 q5, q13, q3
|
||||
vmul.f32 q12, q12, q2
|
||||
vmul.f32 q9, q13, q2
|
||||
vadd.f32 q14, q14, q1
|
||||
vsub.f32 q13, q4, q0
|
||||
vadd.f32 q0, q9, q8
|
||||
vld1.32 {q8,q9}, [r3, :128]
|
||||
vadd.f32 q1, q11, q10
|
||||
vsub.f32 q12, q12, q5
|
||||
vadd.f32 q11, q8, q15
|
||||
vsub.f32 q8, q8, q15
|
||||
vadd.f32 q2, q12, q14
|
||||
vsub.f32 q10, q0, q13
|
||||
vadd.f32 q15, q0, q13
|
||||
vadd.f32 q13, q9, q1
|
||||
vsub.f32 q9, q9, q1
|
||||
vsub.f32 q12, q12, q14
|
||||
vadd.f32 q0, q11, q2
|
||||
vadd.f32 q1, q13, q15
|
||||
vsub.f32 q4, q11, q2
|
||||
vsub.f32 q2, q8, q10 @
|
||||
vadd.f32 q3, q9, q12 @
|
||||
vst2.32 {q0,q1}, [r3, :128]!
|
||||
vsub.f32 q5, q13, q15
|
||||
vld1.32 {q14,q15}, [r10, :128]
|
||||
vsub.f32 q7, q9, q12 @
|
||||
vld1.32 {q12,q13}, [r8, :128]
|
||||
vst2.32 {q2,q3}, [r5, :128]!
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vadd.f32 q6, q8, q10 @
|
||||
vmul.f32 q8, q14, q2
|
||||
vst2.32 {q4,q5}, [r7, :128]!
|
||||
vmul.f32 q10, q15, q3
|
||||
vmul.f32 q9, q13, q3
|
||||
vmul.f32 q11, q12, q2
|
||||
vmul.f32 q14, q14, q3
|
||||
vst2.32 {q6,q7}, [r9, :128]!
|
||||
vmul.f32 q15, q15, q2
|
||||
vmul.f32 q12, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vadd.f32 q10, q10, q8
|
||||
vsub.f32 q11, q11, q9
|
||||
vld1.32 {q8,q9}, [r4, :128]
|
||||
vsub.f32 q14, q15, q14
|
||||
vadd.f32 q15, q13, q12
|
||||
vadd.f32 q13, q11, q10
|
||||
vadd.f32 q12, q15, q14
|
||||
vsub.f32 q15, q15, q14
|
||||
vsub.f32 q14, q11, q10
|
||||
vld1.32 {q10,q11}, [r6, :128]
|
||||
vadd.f32 q0, q8, q13
|
||||
vadd.f32 q1, q9, q12
|
||||
vsub.f32 q2, q10, q15 @
|
||||
vadd.f32 q3, q11, q14 @
|
||||
vsub.f32 q4, q8, q13
|
||||
vst2.32 {q0,q1}, [r4, :128]!
|
||||
vsub.f32 q5, q9, q12
|
||||
vadd.f32 q6, q10, q15 @
|
||||
vst2.32 {q2,q3}, [r6, :128]!
|
||||
vsub.f32 q7, q11, q14 @
|
||||
vst2.32 {q4,q5}, [r8, :128]!
|
||||
vst2.32 {q6,q7}, [r10, :128]!
|
||||
/* Tests the flags from the adds above; the NEON ops between leave them alone. */
bne neon_x8_t_loop
|
||||
|
||||
/* No return here: the bx lr below is commented out in the original. */
@bx lr
|
||||
|
||||
@ assumes r0 = out
|
||||
@ r1 = in ?
|
||||
@
|
||||
@ r12 = offsets
|
||||
@ r3-r10 = data pointers
|
||||
@ r11 = loop iterations
|
||||
@ r2 & lr = temps
|
||||
/*
 * neon_ee -- loop fragment: reads 16 bytes from each of the eight data
 * pointers r3-r10 per iteration (de-interleaving vld2.32, post-increment),
 * combines them using the coefficient pair loaded once from [r2], and writes
 * 2 x 64 bytes of output per iteration.
 *
 * Registers as used by the code below:
 *   r0      = output base address
 *   r2      = on entry, address of 16 bytes of coefficients (d16/d17);
 *             afterwards reused as a scratch output pointer
 *   r3-r10  = data pointers, each advanced by 16 bytes per iteration
 *   r11     = iteration count, decremented to zero
 *   r12     = table of 32-bit offsets, two consumed per iteration; each is
 *             scaled by 4 and added to r0 to form an output address
 *   lr      = overwritten as the second scratch output pointer
 *
 * There is no return instruction and lr is destroyed, so this code cannot be
 * called and returned from like a normal function; execution continues into
 * whatever follows. q0-q15 are overwritten.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_ee
|
||||
_neon_ee:
|
||||
#else
|
||||
.globl neon_ee
|
||||
neon_ee:
|
||||
#endif
|
||||
vld1.32 {d16, d17}, [r2, :128]
|
||||
_neon_ee_loop:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
/* Sets the flags tested by the bne at the bottom of the loop. */
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vadd.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vsub.f32 d31, d5, d2 @
|
||||
vsub.f32 d28, d4, d3 @
|
||||
vadd.f32 d30, d4, d3 @
|
||||
vadd.f32 d5, d19, d14 @-
|
||||
vadd.f32 d7, d31, d26 @-
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vsub.f32 d6, d30, d27 @-
|
||||
vsub.f32 d4, d18, d15 @-
|
||||
vsub.f32 d13, d19, d14 @-
|
||||
vadd.f32 d12, d18, d15 @-
|
||||
vsub.f32 d15, d31, d26 @-
|
||||
/* Fetch the two output offsets for this iteration. */
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vadd.f32 d14, d30, d27 @-
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_loop
|
||||
|
||||
@ assumes r0 = out
|
||||
@
|
||||
@ r12 = offsets
|
||||
@ r3-r10 = data pointers
|
||||
@ r11 = loop iterations
|
||||
@ r2 & lr = temps
|
||||
/*
 * neon_oo -- loop fragment: reads 16 bytes from each of the eight data
 * pointers r3-r10 per iteration (de-interleaving vld2.32, post-increment)
 * and writes 2 x 64 bytes of output per iteration using additions and
 * subtractions only (no coefficient table is loaded here).
 *
 * Registers as used by the code below:
 *   r0      = output base address
 *   r3-r10  = data pointers, each advanced by 16 bytes per iteration
 *   r11     = iteration count, decremented to zero
 *   r12     = table of 32-bit offsets, two consumed per iteration; each is
 *             scaled by 4 and added to r0 to form an output address
 *   r2, lr  = overwritten as scratch output pointers
 *
 * There is no return instruction and lr is destroyed; execution continues
 * into whatever follows. q0-q13 are overwritten.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_oo
|
||||
_neon_oo:
|
||||
#else
|
||||
.globl neon_oo
|
||||
neon_oo:
|
||||
#endif
|
||||
_neon_oo_loop:
|
||||
vld2.32 {q8}, [r6, :128]!
|
||||
vld2.32 {q9}, [r5, :128]!
|
||||
vld2.32 {q10}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vadd.f32 q11, q9, q8
|
||||
vsub.f32 q8, q9, q8
|
||||
vsub.f32 q9, q13, q10
|
||||
vadd.f32 q12, q13, q10
|
||||
/* Sets the flags tested by the bne at the bottom of the loop. */
subs r11, r11, #1
|
||||
vld2.32 {q10}, [r7, :128]!
|
||||
vld2.32 {q13}, [r9, :128]!
|
||||
vsub.f32 q2, q12, q11
|
||||
vsub.f32 d7, d19, d16 @
|
||||
vadd.f32 d3, d19, d16 @
|
||||
vadd.f32 d6, d18, d17 @
|
||||
vsub.f32 d2, d18, d17 @
|
||||
vld2.32 {q9}, [r8, :128]!
|
||||
vld2.32 {q8}, [r10, :128]!
|
||||
vadd.f32 q0, q12, q11
|
||||
vadd.f32 q11, q13, q8
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 q8, q13, q8
|
||||
vsub.f32 q9, q10, q9
|
||||
vsub.f32 q6, q12, q11
|
||||
vadd.f32 q4, q12, q11
|
||||
vtrn.32 q0, q2
|
||||
/* Fetch the two output offsets for this iteration. */
ldr r2, [r12], #4
|
||||
vsub.f32 d15, d19, d16 @
|
||||
ldr lr, [r12], #4
|
||||
vadd.f32 d11, d19, d16 @
|
||||
vadd.f32 d14, d18, d17 @
|
||||
vsub.f32 d10, d18, d17 @
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q1, q3
|
||||
add lr, r0, lr, lsl #2
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_oo_loop
|
||||
|
||||
@ assumes r0 = out
|
||||
@
|
||||
@ r12 = offsets
|
||||
@ r3-r10 = data pointers
|
||||
@ r11 = addr of twiddle
|
||||
@ r2 & lr = temps
|
||||
/*
 * neon_eo -- straight-line fragment (no loop): reads 16 bytes from each of
 * the eight data pointers r3-r10 (de-interleaving vld2.32, post-increment),
 * applies the coefficient pair loaded from [r11] to part of the data, and
 * writes 64 bytes at each of two output addresses.
 *
 * Registers as used by the code below:
 *   r0      = output base address
 *   r3-r10  = data pointers, each advanced by 16 bytes
 *   r11     = address of 16 bytes of coefficients (d20/d21)
 *   r12     = table of 32-bit offsets, two consumed; each is scaled by 4 and
 *             added to r0 to form an output address
 *   r2, lr  = overwritten as scratch output pointers
 *
 * There is no return instruction and lr is destroyed; execution continues
 * into whatever follows. q0-q15 are overwritten.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_eo
|
||||
_neon_eo:
|
||||
#else
|
||||
.globl neon_eo
|
||||
neon_eo:
|
||||
#endif
|
||||
vld2.32 {q9}, [r5, :128]! @tag2
|
||||
vld2.32 {q13}, [r3, :128]! @tag0
|
||||
vld2.32 {q12}, [r4, :128]! @tag1
|
||||
vld2.32 {q0}, [r7, :128]! @tag4
|
||||
vsub.f32 q11, q13, q12
|
||||
vld2.32 {q8}, [r6, :128]! @tag3
|
||||
vadd.f32 q12, q13, q12
|
||||
vsub.f32 q10, q9, q8
|
||||
vadd.f32 q8, q9, q8
|
||||
vadd.f32 q9, q12, q8
|
||||
vadd.f32 d9, d23, d20 @
|
||||
vsub.f32 d11, d23, d20 @
|
||||
vsub.f32 q8, q12, q8
|
||||
vsub.f32 d8, d22, d21 @
|
||||
vadd.f32 d10, d22, d21 @
|
||||
/* Fetch the two output offsets and the coefficient pair. */
ldr r2, [r12], #4
|
||||
vld1.32 {d20, d21}, [r11, :128]
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q9, q4
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q8, q5
|
||||
add lr, r0, lr, lsl #2
|
||||
vswp d9,d10
|
||||
/* First 32 bytes of the output block addressed by lr. */
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
|
||||
vld2.32 {q13}, [r10, :128]! @tag7
|
||||
vld2.32 {q15}, [r9, :128]! @tag6
|
||||
vld2.32 {q11}, [r8, :128]! @tag5
|
||||
vsub.f32 q14, q15, q13
|
||||
vsub.f32 q12, q0, q11
|
||||
vadd.f32 q11, q0, q11
|
||||
vadd.f32 q13, q15, q13
|
||||
vadd.f32 d13, d29, d24 @
|
||||
vadd.f32 q15, q13, q11
|
||||
vsub.f32 d12, d28, d25 @
|
||||
vsub.f32 d15, d29, d24 @
|
||||
vadd.f32 d14, d28, d25 @
|
||||
vtrn.32 q15, q6
|
||||
vsub.f32 q15, q13, q11
|
||||
vtrn.32 q15, q7
|
||||
vswp d13, d14
|
||||
/* Second 32 bytes of the output block addressed by lr. */
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
|
||||
vtrn.32 q13, q14
|
||||
vtrn.32 q11, q12
|
||||
/* Products with the coefficient pair in d20/d21. */
vmul.f32 d24, d26, d21
|
||||
vmul.f32 d28, d27, d20
|
||||
vmul.f32 d25, d26, d20
|
||||
vmul.f32 d26, d27, d21
|
||||
vmul.f32 d27, d22, d21
|
||||
vmul.f32 d30, d23, d20
|
||||
vmul.f32 d29, d23, d21
|
||||
vmul.f32 d22, d22, d20
|
||||
vsub.f32 d21, d28, d24
|
||||
vadd.f32 d20, d26, d25
|
||||
vadd.f32 d25, d30, d27
|
||||
vsub.f32 d24, d22, d29
|
||||
vadd.f32 q11, q12, q10
|
||||
vsub.f32 q10, q12, q10
|
||||
vadd.f32 q0, q9, q11
|
||||
vsub.f32 q2, q9, q11
|
||||
vadd.f32 d3, d17, d20 @
|
||||
vsub.f32 d7, d17, d20 @
|
||||
vsub.f32 d2, d16, d21 @
|
||||
vadd.f32 d6, d16, d21 @
|
||||
vswp d1, d2
|
||||
vswp d5, d6
|
||||
/* 64-byte output block addressed by r2. */
vstmia r2!, {q0-q3}
|
||||
|
||||
|
||||
@ assumes r0 = out
|
||||
@
|
||||
@ r12 = offsets
|
||||
@ r3-r10 = data pointers
|
||||
@ r11 = addr of twiddle
|
||||
@ r2 & lr = temps
|
||||
/*
 * neon_oe -- straight-line fragment (no loop): reads 16 bytes from each of
 * the eight data pointers r3-r10 (post-increment), applies the coefficient
 * pair loaded from [r11] to part of the data, and writes 64 bytes at each of
 * two output addresses.
 *
 * Unlike the other fragments in this group, the loads from r5 and r6 use
 * vld1.32 (no de-interleave); the other six pointers use vld2.32.
 *
 * Registers as used by the code below:
 *   r0      = output base address
 *   r3-r10  = data pointers, each advanced by 16 bytes
 *   r11     = address of 16 bytes of coefficients (d24/d25)
 *   r12     = table of 32-bit offsets, two consumed; each is scaled by 4 and
 *             added to r0 to form an output address
 *   r2, lr  = overwritten as scratch output pointers
 *
 * There is no return instruction and lr is destroyed; execution continues
 * into whatever follows. q0-q15 are overwritten.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_oe
|
||||
_neon_oe:
|
||||
#else
|
||||
.globl neon_oe
|
||||
neon_oe:
|
||||
#endif
|
||||
vld1.32 {q8}, [r5, :128]!
|
||||
vld1.32 {q10}, [r6, :128]!
|
||||
vld2.32 {q11}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
/* vorr with identical source operands is a plain register copy. */
vorr d25, d17, d17
|
||||
vorr d24, d20, d20
|
||||
vorr d20, d16, d16
|
||||
vsub.f32 q9, q13, q11
|
||||
vadd.f32 q11, q13, q11
|
||||
/* Fetch the two output offsets. */
ldr r2, [r12], #4
|
||||
vtrn.32 d24, d25
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 d20, d21
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q8, q10, q12
|
||||
add lr, r0, lr, lsl #2
|
||||
vadd.f32 q10, q10, q12
|
||||
vadd.f32 q0, q11, q10
|
||||
vadd.f32 d25, d19, d16 @
|
||||
vsub.f32 d27, d19, d16 @
|
||||
vsub.f32 q1, q11, q10
|
||||
vsub.f32 d24, d18, d17 @
|
||||
vadd.f32 d26, d18, d17 @
|
||||
vtrn.32 q0, q12
|
||||
vtrn.32 q1, q13
|
||||
/* Coefficient pair; overwrites the q12 value produced just above. */
vld1.32 {d24, d25}, [r11, :128]
|
||||
vswp d1, d2
|
||||
/* First 32 bytes of the output block addressed by r2. */
vst1.32 {q0, q1}, [r2, :128]!
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
vadd.f32 q1, q0, q15
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vsub.f32 q15, q0, q15
|
||||
vsub.f32 q0, q14, q13
|
||||
vadd.f32 q3, q14, q13
|
||||
vadd.f32 q2, q3, q1
|
||||
vadd.f32 d29, d1, d30 @
|
||||
vsub.f32 d27, d1, d30 @
|
||||
vsub.f32 q3, q3, q1
|
||||
vsub.f32 d28, d0, d31 @
|
||||
vadd.f32 d26, d0, d31 @
|
||||
vtrn.32 q2, q14
|
||||
vtrn.32 q3, q13
|
||||
vswp d5, d6
|
||||
/* Second 32 bytes of the output block addressed by r2. */
vst1.32 {q2, q3}, [r2, :128]!
|
||||
vtrn.32 q11, q9
|
||||
vtrn.32 q10, q8
|
||||
/* Products with the coefficient pair in d24/d25. */
vmul.f32 d20, d18, d25
|
||||
vmul.f32 d22, d19, d24
|
||||
vmul.f32 d21, d19, d25
|
||||
vmul.f32 d18, d18, d24
|
||||
vmul.f32 d19, d16, d25
|
||||
vmul.f32 d30, d17, d24
|
||||
vmul.f32 d23, d16, d24
|
||||
vmul.f32 d24, d17, d25
|
||||
vadd.f32 d17, d22, d20
|
||||
vsub.f32 d16, d18, d21
|
||||
vsub.f32 d21, d30, d19
|
||||
vadd.f32 d20, d24, d23
|
||||
vadd.f32 q9, q8, q10
|
||||
vsub.f32 q8, q8, q10
|
||||
vadd.f32 q4, q14, q9
|
||||
vsub.f32 q6, q14, q9
|
||||
vadd.f32 d11, d27, d16 @
|
||||
vsub.f32 d15, d27, d16 @
|
||||
vsub.f32 d10, d26, d17 @
|
||||
vadd.f32 d14, d26, d17 @
|
||||
vswp d9, d10
|
||||
vswp d13, d14
|
||||
/* 64-byte output block addressed by lr. */
vstmia lr!, {q4-q7}
|
||||
|
||||
|
||||
/*
 * neon_end -- global label followed by a single "bx lr".
 * NOTE(review): presumably serves as an end-of-code marker for the fragment
 * that precedes it rather than as a function of its own -- confirm against
 * the code that references this symbol.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_end
|
||||
_neon_end:
|
||||
#else
|
||||
.globl neon_end
|
||||
neon_end:
|
||||
#endif
|
||||
bx lr
|
||||
|
||||
|
||||
/*
 * neon_transpose -- transposes a matrix of 8-byte elements.
 *
 * Registers on entry, as used by the code below:
 *   r0 = source pointer, read two rows at a time
 *   r1 = destination pointer
 *   r2 = row length of the source in elements (byte stride r2*8)
 *   r3 = number of source rows (destination byte stride r3*8)
 *
 * Each inner iteration loads four elements from each of two adjacent source
 * rows, swaps D registers so every Q register holds one element from each
 * row, and stores the four pairs into four consecutive destination rows.
 *
 * Both loops exit only when their counter reaches exactly zero: r4 starts at
 * r2 and drops by 4, r5 starts at r3 and drops by 2. A width that is not a
 * positive multiple of 4, or a height that is not a positive multiple of 2,
 * therefore never satisfies the exit test at the expected point.
 *
 * r4-r8 are saved and restored. q8-q15 are overwritten.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_transpose
|
||||
_neon_transpose:
|
||||
#else
|
||||
.globl neon_transpose
|
||||
neon_transpose:
|
||||
#endif
|
||||
push {r4-r8}
|
||||
@ vpush {q8-q9}
|
||||
/* r5 = remaining source rows. */
mov r5, r3
|
||||
_neon_transpose_col:
|
||||
/* r7/r8 = two adjacent destination rows; r6 = second source row. */
mov r7, r1
|
||||
add r8, r1, r3, lsl #3
|
||||
mov r4, r2
|
||||
add r6, r0, r2, lsl #3
|
||||
_neon_transpose_row:
|
||||
vld1.32 {q8,q9}, [r0, :128]!
|
||||
@ vld1.32 {q10,q11}, [r0, :128]!
|
||||
vld1.32 {q12,q13}, [r6, :128]!
|
||||
@ vld1.32 {q14,q15}, [r6, :128]!
|
||||
sub r4, r4, #4
|
||||
/* Flags set here are consumed by the bne at the end of the row loop. */
cmp r4, #0
|
||||
vswp d17,d24
|
||||
vswp d19,d26
|
||||
/* The next two swaps touch q10/q11/q14/q15, whose loads are commented out
   above; their results are never stored. */
vswp d21,d28
|
||||
vswp d23,d30
|
||||
vst1.32 {q8}, [r7, :128]
|
||||
vst1.32 {q12}, [r8, :128]
|
||||
/* Advance both destination pointers by two destination rows. */
add r7, r7, r3, lsl #4
|
||||
add r8, r8, r3, lsl #4
|
||||
vst1.32 {q9}, [r7, :128]
|
||||
vst1.32 {q13}, [r8, :128]
|
||||
add r7, r7, r3, lsl #4
|
||||
add r8, r8, r3, lsl #4
|
||||
@@vst1.32 {q10}, [r7, :128]
|
||||
@@vst1.32 {q14}, [r8, :128]
|
||||
@@add r7, r7, r3, lsl #4
|
||||
@@add r8, r8, r3, lsl #4
|
||||
@@vst1.32 {q11}, [r7, :128]
|
||||
@@vst1.32 {q15}, [r8, :128]
|
||||
@@add r7, r7, r3, lsl #4
|
||||
@@add r8, r8, r3, lsl #4
|
||||
bne _neon_transpose_row
|
||||
sub r5, r5, #2
|
||||
cmp r5, #0
|
||||
/* r0 already sits at the end of the first row; skip the second row too. */
add r0, r0, r2, lsl #3
|
||||
/* Move two elements (16 bytes) to the right in the destination. */
add r1, r1, #16
|
||||
bne _neon_transpose_col
|
||||
@ vpop {q8-q9}
|
||||
pop {r4-r8}
|
||||
bx lr
|
||||
|
||||
/*
 * neon_transpose_to_buf -- transposes a fixed 8x8 tile of 8-byte elements
 * from a strided source into a packed buffer.
 *
 * Registers on entry, as used by the code below:
 *   r0 = source pointer (top-left element of the tile)
 *   r1 = destination buffer; rows are 64 bytes apart, 512 bytes in total
 *   r2 = row length of the source in elements (byte stride r2*8)
 *
 * The outer loop runs 4 times (r5 counts 8 down by 2) and handles two source
 * rows per pass. The inner loop runs twice (r4 counts 8 down by 4) and
 * handles four elements of each row per pass.
 *
 * r4-r10 are saved and restored. q8, q9, q12 and q13 are overwritten.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_transpose_to_buf
|
||||
_neon_transpose_to_buf:
|
||||
#else
|
||||
.globl neon_transpose_to_buf
|
||||
neon_transpose_to_buf:
|
||||
#endif
|
||||
push {r4-r10}
|
||||
mov r5, #8
|
||||
_neon_transpose_to_buf_col:
|
||||
mov r4, #8
|
||||
/* r6 = second source row; r7..r10 = four consecutive destination rows. */
add r6, r0, r2, lsl #3
|
||||
mov r7, r1
|
||||
add r8, r1, #64
|
||||
add r9, r1, #128
|
||||
add r10, r1, #192
|
||||
_neon_transpose_to_buf_row:
|
||||
vld1.32 {q8,q9}, [r0, :128]!
|
||||
vld1.32 {q12,q13}, [r6, :128]!
|
||||
sub r4, r4, #4
|
||||
/* Flags set here are consumed by the bne at the end of the row loop. */
cmp r4, #0
|
||||
/* After the swaps each Q register pairs one element from each source row. */
vswp d17,d24
|
||||
vswp d19,d26
|
||||
vst1.32 {q8}, [r7, :128]
|
||||
vst1.32 {q12}, [r8, :128]
|
||||
vst1.32 {q9}, [r9, :128]
|
||||
vst1.32 {q13}, [r10, :128]
|
||||
/* Step all four destination pointers down by four rows (4 * 64 bytes). */
add r7, r7, #256
|
||||
add r8, r8, #256
|
||||
add r9, r9, #256
|
||||
add r10, r10, #256
|
||||
bne _neon_transpose_to_buf_row
|
||||
sub r5, r5, #2
|
||||
cmp r5, #0
|
||||
/* Undo the 64 bytes consumed by the row loop, then advance two source rows. */
sub r0, r0, #64
|
||||
add r0, r0, r2, lsl #4
|
||||
/* Move two elements (16 bytes) to the right in the destination. */
add r1, r1, #16
|
||||
bne _neon_transpose_to_buf_col
|
||||
pop {r4-r10}
|
||||
bx lr
|
||||
1127
3rdparty/ffts/ffts-master/src/neon_float.h
vendored
Normal file
1127
3rdparty/ffts/ffts-master/src/neon_float.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
956
3rdparty/ffts/ffts-master/src/neon_static_f.s
vendored
Normal file
956
3rdparty/ffts/ffts-master/src/neon_static_f.s
vendored
Normal file
@@ -0,0 +1,956 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
 * neon_static_e_f -- complete function with prologue and epilogue (saves
 * r4-r11, lr and d8-d15, returns by popping pc).
 *
 * Registers on entry, as used by the code below:
 *   r0 = address of a plan structure, read at fixed byte offsets:
 *          +0   table of 32-bit output offsets (walked through r12)
 *          +12  address of a coefficient pair used by the straight-line step
 *          +16  address of a coefficient pair used by both "ee" loops
 *          +28  iteration count of the first loop  (i0)
 *          +32  iteration count of the later loops (i1)
 *          +40  byte distance between the eight input pointers
 *   r1 = input base address
 *   r2 = output base address (moved into r0; the plan pointer moves to r1)
 *
 * Phases, in order:
 *   1. _neon_ee_loop, i0 iterations.
 *   2. One straight-line step using the coefficients at plan+12.
 *   3. Swap r7<->r9 and r8<->r10; _neon_oo_loop, i1 iterations (skipped if
 *      i1 is zero).
 *   4. Swap r3<->r7, r4<->r8, r5<->r9, r6<->r10, then r9<->r10;
 *      _neon_ee_loop2, i1 iterations (skipped if i1 is zero).
 *
 * i0 is not tested for zero before phase 1, unlike i1 before phases 3 and 4.
 * NOTE(review): i0 is presumably always at least 1 -- confirm in the plan
 * setup code.
 * NOTE(review): the instructions tagged with a bare trailing "@" use the
 * opposite add/subtract choice from the same-named loops in neon.s;
 * presumably this encodes the transform direction -- confirm.
 */
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_e_f
|
||||
_neon_static_e_f:
|
||||
#else
|
||||
.globl neon_static_e_f
|
||||
neon_static_e_f:
|
||||
#endif
|
||||
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
ldr lr, [r0, #40] @ this is p->N
|
||||
/* Eight input pointers spaced lr bytes apart, in the order
   r3, r7, r5, r10, r4, r8, r6, r9. */
add r3, r1, #0
|
||||
add r7, r1, lr
|
||||
add r5, r7, lr
|
||||
add r10, r5, lr
|
||||
add r4, r10, lr
|
||||
add r8, r4, lr
|
||||
add r6, r8, lr
|
||||
add r9, r6, lr
|
||||
ldr r12, [r0]
|
||||
/* From here on r1 = plan pointer and r0 = output base. */
add r1, r0, #0
|
||||
add r0, r2, #0
|
||||
ldr r2, [r1, #16] @ this is p->ee_ws
|
||||
ldr r11, [r1, #28] @ this is p->i0
|
||||
|
||||
vld1.32 {d16, d17}, [r2, :128]
|
||||
/* Phase 1: runs r11 (= i0) times. */
_neon_ee_loop:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vsub.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 d31, d5, d2 @
|
||||
vadd.f32 d28, d4, d3 @
|
||||
vsub.f32 d30, d4, d3 @
|
||||
vsub.f32 d5, d19, d14 @
|
||||
vsub.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vadd.f32 d6, d30, d27 @
|
||||
vadd.f32 d4, d18, d15 @
|
||||
vadd.f32 d13, d19, d14 @
|
||||
vsub.f32 d12, d18, d15 @
|
||||
vadd.f32 d15, d31, d26 @
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vsub.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_loop
|
||||
|
||||
/* Phase 2: single straight-line step; r11 = coefficient address at plan+12. */
ldr r11, [r1, #12]
|
||||
vld2.32 {q9}, [r5, :128]! @tag2
|
||||
vld2.32 {q13}, [r3, :128]! @tag0
|
||||
vld2.32 {q12}, [r4, :128]! @tag1
|
||||
vld2.32 {q0}, [r7, :128]! @tag4
|
||||
vsub.f32 q11, q13, q12
|
||||
vld2.32 {q8}, [r6, :128]! @tag3
|
||||
vadd.f32 q12, q13, q12
|
||||
vsub.f32 q10, q9, q8
|
||||
vadd.f32 q8, q9, q8
|
||||
vadd.f32 q9, q12, q8
|
||||
vsub.f32 d9, d23, d20 @
|
||||
vadd.f32 d11, d23, d20 @
|
||||
vsub.f32 q8, q12, q8
|
||||
vadd.f32 d8, d22, d21 @
|
||||
vsub.f32 d10, d22, d21 @
|
||||
ldr r2, [r12], #4
|
||||
vld1.32 {d20, d21}, [r11, :128]
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q9, q4
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q8, q5
|
||||
add lr, r0, lr, lsl #2
|
||||
vswp d9,d10
|
||||
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
|
||||
vld2.32 {q13}, [r10, :128]! @tag7
|
||||
vld2.32 {q15}, [r9, :128]! @tag6
|
||||
vld2.32 {q11}, [r8, :128]! @tag5
|
||||
vsub.f32 q14, q15, q13
|
||||
vsub.f32 q12, q0, q11
|
||||
vadd.f32 q11, q0, q11
|
||||
vadd.f32 q13, q15, q13
|
||||
vsub.f32 d13, d29, d24 @
|
||||
vadd.f32 q15, q13, q11
|
||||
vadd.f32 d12, d28, d25 @
|
||||
vadd.f32 d15, d29, d24 @
|
||||
vsub.f32 d14, d28, d25 @
|
||||
vtrn.32 q15, q6
|
||||
vsub.f32 q15, q13, q11
|
||||
vtrn.32 q15, q7
|
||||
vswp d13, d14
|
||||
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
|
||||
vtrn.32 q13, q14
|
||||
vtrn.32 q11, q12
|
||||
vmul.f32 d24, d26, d21
|
||||
vmul.f32 d28, d27, d20
|
||||
vmul.f32 d25, d26, d20
|
||||
vmul.f32 d26, d27, d21
|
||||
vmul.f32 d27, d22, d21
|
||||
vmul.f32 d30, d23, d20
|
||||
vmul.f32 d29, d23, d21
|
||||
vmul.f32 d22, d22, d20
|
||||
vsub.f32 d21, d28, d24
|
||||
vadd.f32 d20, d26, d25
|
||||
vadd.f32 d25, d30, d27
|
||||
vsub.f32 d24, d22, d29
|
||||
vadd.f32 q11, q12, q10
|
||||
vsub.f32 q10, q12, q10
|
||||
vadd.f32 q0, q9, q11
|
||||
vsub.f32 q2, q9, q11
|
||||
vsub.f32 d3, d17, d20 @
|
||||
vadd.f32 d7, d17, d20 @
|
||||
vadd.f32 d2, d16, d21 @
|
||||
vsub.f32 d6, d16, d21 @
|
||||
vswp d1, d2
|
||||
vswp d5, d6
|
||||
vstmia r2!, {q0-q3}
|
||||
|
||||
/* Phase 3 setup: swap r7<->r9 and r8<->r10 through r2. */
add r2, r7, #0
|
||||
add r7, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r8, #0
|
||||
add r8, r10, #0
|
||||
add r10, r2, #0
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
/* Skip the loop entirely when the count is zero. */
cmp r11, #0
|
||||
beq _neon_oo_loop_exit
|
||||
_neon_oo_loop:
|
||||
vld2.32 {q8}, [r6, :128]!
|
||||
vld2.32 {q9}, [r5, :128]!
|
||||
vld2.32 {q10}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vadd.f32 q11, q9, q8
|
||||
vsub.f32 q8, q9, q8
|
||||
vsub.f32 q9, q13, q10
|
||||
vadd.f32 q12, q13, q10
|
||||
subs r11, r11, #1
|
||||
vld2.32 {q10}, [r7, :128]!
|
||||
vld2.32 {q13}, [r9, :128]!
|
||||
vsub.f32 q2, q12, q11
|
||||
vadd.f32 d7, d19, d16 @
|
||||
vsub.f32 d3, d19, d16 @
|
||||
vsub.f32 d6, d18, d17 @
|
||||
vadd.f32 d2, d18, d17 @
|
||||
vld2.32 {q9}, [r8, :128]!
|
||||
vld2.32 {q8}, [r10, :128]!
|
||||
vadd.f32 q0, q12, q11
|
||||
vadd.f32 q11, q13, q8
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 q8, q13, q8
|
||||
vsub.f32 q9, q10, q9
|
||||
vsub.f32 q6, q12, q11
|
||||
vadd.f32 q4, q12, q11
|
||||
vtrn.32 q0, q2
|
||||
ldr r2, [r12], #4
|
||||
vadd.f32 d15, d19, d16 @
|
||||
ldr lr, [r12], #4
|
||||
vsub.f32 d11, d19, d16 @
|
||||
vsub.f32 d14, d18, d17 @
|
||||
vadd.f32 d10, d18, d17 @
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q1, q3
|
||||
add lr, r0, lr, lsl #2
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_oo_loop
|
||||
_neon_oo_loop_exit:
|
||||
|
||||
|
||||
/* Phase 4 setup: swap r3<->r7, r4<->r8, r5<->r9, r6<->r10, then r9<->r10,
   all through r2. */
add r2, r3, #0
|
||||
add r3, r7, #0
|
||||
add r7, r2, #0
|
||||
add r2, r4, #0
|
||||
add r4, r8, #0
|
||||
add r8, r2, #0
|
||||
add r2, r5, #0
|
||||
add r5, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r6, #0
|
||||
add r6, r10, #0
|
||||
add r10, r2, #0
|
||||
add r2, r9, #0
|
||||
add r9, r10, #0
|
||||
add r10, r2, #0
|
||||
/* Same coefficient address (plan+16) as phase 1. */
ldr r2, [r1, #16]
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
/* Skip the loop entirely when the count is zero. */
cmp r11, #0
|
||||
beq _neon_ee_loop2_exit
|
||||
|
||||
vld1.32 {d16, d17}, [r2, :128]
|
||||
_neon_ee_loop2:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vsub.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 d31, d5, d2 @
|
||||
vadd.f32 d28, d4, d3 @
|
||||
vsub.f32 d30, d4, d3 @
|
||||
vsub.f32 d5, d19, d14 @
|
||||
vsub.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vadd.f32 d6, d30, d27 @
|
||||
vadd.f32 d4, d18, d15 @
|
||||
vadd.f32 d13, d19, d14 @
|
||||
vsub.f32 d12, d18, d15 @
|
||||
vadd.f32 d15, d31, d26 @
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vsub.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_loop2
|
||||
_neon_ee_loop2_exit:
|
||||
|
||||
/* Epilogue: restore d8-d15 and r4-r11, return by loading the saved lr into pc. */
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
|
||||
|
||||
|
||||
@ neon_static_o_f
@
@ On entry: r0 = p (plan structure), r1 = input base, r2 = output base.
@ Words of p read here (byte offsets): 0 = table of 32-bit output offsets,
@ 8 = 16 bytes of multipliers used by the single tail block, 16 = p->ee_ws,
@ 28 = p->i0, 32 = p->i1, 40 = p->N.
@ Structure: an ee loop run p->i0 times, an oo loop run p->i1 times (skipped
@ when zero), one tail block, then a second ee loop run p->i1 times (skipped
@ when zero).
@ A bare trailing @ marks an add/sub whose sense is flipped in the matching
@ _i routine (visible for the ee and oo loops; presumably the same holds for
@ the tail block - confirm against neon_static_o_i).
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_o_f
|
||||
_neon_static_o_f:
|
||||
#else
|
||||
.globl neon_static_o_f
|
||||
neon_static_o_f:
|
||||
#endif
|
||||
@ Save the core registers and the NEON registers d8-d15 that are overwritten below.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
@ Build eight input pointers spaced lr bytes apart, in the order
@ r3, r7, r5, r10, r4, r8, r6, r9.
ldr lr, [r0, #40] @ this is p->N
|
||||
add r3, r1, #0
|
||||
add r7, r1, lr
|
||||
add r5, r7, lr
|
||||
add r10, r5, lr
|
||||
add r4, r10, lr
|
||||
add r8, r4, lr
|
||||
add r6, r8, lr
|
||||
add r9, r6, lr
|
||||
@ r12 walks the output offset table; from here on r1 = p and r0 = output base.
ldr r12, [r0]
|
||||
add r1, r0, #0
|
||||
add r0, r2, #0
|
||||
ldr r2, [r1, #16] @ this is p->ee_ws
|
||||
ldr r11, [r1, #28] @ this is p->i0
|
||||
|
|
||||
@ q8 (d16, d17) holds the ee multipliers for the whole first loop.
vld1.32 {d16, d17}, [r2, :128]
|
||||
@ First ee loop. The counter is decremented before it is tested, so p->i0 is
@ assumed to be at least 1.
_neon_ee_o_loop:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
@ Flags set here are consumed by the bne at the bottom of the loop.
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vsub.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 d31, d5, d2 @
|
||||
vadd.f32 d28, d4, d3 @
|
||||
vsub.f32 d30, d4, d3 @
|
||||
vsub.f32 d5, d19, d14 @
|
||||
vsub.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vadd.f32 d6, d30, d27 @
|
||||
vadd.f32 d4, d18, d15 @
|
||||
vadd.f32 d13, d19, d14 @
|
||||
vsub.f32 d12, d18, d15 @
|
||||
vadd.f32 d15, d31, d26 @
|
||||
@ Fetch two output offsets; each is scaled by 4 and added to the output base.
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vsub.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_o_loop
|
||||
|
|
||||
@ Swap r7 with r9 and r8 with r10, using r2 as scratch.
add r2, r7, #0
|
||||
add r7, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r8, #0
|
||||
add r8, r10, #0
|
||||
add r10, r2, #0
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
cmp r11, #0
|
||||
beq _neon_oo_o_loop_exit
|
||||
@ oo loop: run p->i1 times; it uses no multiplier table.
_neon_oo_o_loop:
|
||||
vld2.32 {q8}, [r6, :128]!
|
||||
vld2.32 {q9}, [r5, :128]!
|
||||
vld2.32 {q10}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vadd.f32 q11, q9, q8
|
||||
vsub.f32 q8, q9, q8
|
||||
vsub.f32 q9, q13, q10
|
||||
vadd.f32 q12, q13, q10
|
||||
subs r11, r11, #1
|
||||
vld2.32 {q10}, [r7, :128]!
|
||||
vld2.32 {q13}, [r9, :128]!
|
||||
vsub.f32 q2, q12, q11
|
||||
vadd.f32 d7, d19, d16 @
|
||||
vsub.f32 d3, d19, d16 @
|
||||
vsub.f32 d6, d18, d17 @
|
||||
vadd.f32 d2, d18, d17 @
|
||||
vld2.32 {q9}, [r8, :128]!
|
||||
vld2.32 {q8}, [r10, :128]!
|
||||
vadd.f32 q0, q12, q11
|
||||
vadd.f32 q11, q13, q8
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 q8, q13, q8
|
||||
vsub.f32 q9, q10, q9
|
||||
vsub.f32 q6, q12, q11
|
||||
vadd.f32 q4, q12, q11
|
||||
vtrn.32 q0, q2
|
||||
ldr r2, [r12], #4
|
||||
vadd.f32 d15, d19, d16 @
|
||||
ldr lr, [r12], #4
|
||||
vsub.f32 d11, d19, d16 @
|
||||
vsub.f32 d14, d18, d17 @
|
||||
vadd.f32 d10, d18, d17 @
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q1, q3
|
||||
add lr, r0, lr, lsl #2
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_oo_o_loop
|
||||
_neon_oo_o_loop_exit:
|
||||
|
|
||||
@ Single tail block, executed exactly once. r11 points at the 16 bytes of
@ multipliers that are loaded into d24, d25 further down.
ldr r11, [r1, #8]
|
||||
vld1.32 {q8}, [r5, :128]!
|
||||
vld1.32 {q10}, [r6, :128]!
|
||||
vld2.32 {q11}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vorr d25, d17, d17
|
||||
vorr d24, d20, d20
|
||||
vorr d20, d16, d16
|
||||
vsub.f32 q9, q13, q11
|
||||
vadd.f32 q11, q13, q11
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 d24, d25
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 d20, d21
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q8, q10, q12
|
||||
add lr, r0, lr, lsl #2
|
||||
vadd.f32 q10, q10, q12
|
||||
vadd.f32 q0, q11, q10
|
||||
vsub.f32 d25, d19, d16 @
|
||||
vadd.f32 d27, d19, d16 @
|
||||
vsub.f32 q1, q11, q10
|
||||
vadd.f32 d24, d18, d17 @
|
||||
vsub.f32 d26, d18, d17 @
|
||||
vtrn.32 q0, q12
|
||||
vtrn.32 q1, q13
|
||||
vld1.32 {d24, d25}, [r11, :128]
|
||||
vswp d1, d2
|
||||
vst1.32 {q0, q1}, [r2, :128]!
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
vadd.f32 q1, q0, q15
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vsub.f32 q15, q0, q15
|
||||
vsub.f32 q0, q14, q13
|
||||
vadd.f32 q3, q14, q13
|
||||
vadd.f32 q2, q3, q1
|
||||
vsub.f32 d29, d1, d30 @
|
||||
vadd.f32 d27, d1, d30 @
|
||||
vsub.f32 q3, q3, q1
|
||||
vadd.f32 d28, d0, d31 @
|
||||
vsub.f32 d26, d0, d31 @
|
||||
vtrn.32 q2, q14
|
||||
vtrn.32 q3, q13
|
||||
vswp d5, d6
|
||||
vst1.32 {q2, q3}, [r2, :128]!
|
||||
vtrn.32 q11, q9
|
||||
vtrn.32 q10, q8
|
||||
vmul.f32 d20, d18, d25
|
||||
vmul.f32 d22, d19, d24
|
||||
vmul.f32 d21, d19, d25
|
||||
vmul.f32 d18, d18, d24
|
||||
vmul.f32 d19, d16, d25
|
||||
vmul.f32 d30, d17, d24
|
||||
vmul.f32 d23, d16, d24
|
||||
vmul.f32 d24, d17, d25
|
||||
vadd.f32 d17, d22, d20
|
||||
vsub.f32 d16, d18, d21
|
||||
vsub.f32 d21, d30, d19
|
||||
vadd.f32 d20, d24, d23
|
||||
vadd.f32 q9, q8, q10
|
||||
vsub.f32 q8, q8, q10
|
||||
vadd.f32 q4, q14, q9
|
||||
vsub.f32 q6, q14, q9
|
||||
vsub.f32 d11, d27, d16 @
|
||||
vadd.f32 d15, d27, d16 @
|
||||
vadd.f32 d10, d26, d17 @
|
||||
vsub.f32 d14, d26, d17 @
|
||||
vswp d9, d10
|
||||
vswp d13, d14
|
||||
vstmia lr!, {q4-q7}
|
||||
|
|
||||
|
|
||||
@ Re-map the input pointers for the second ee loop: swap r3/r7, r4/r8,
@ r5/r9 and r6/r10, then swap r9/r10 once more (r2 is the scratch register).
add r2, r3, #0
|
||||
add r3, r7, #0
|
||||
add r7, r2, #0
|
||||
add r2, r4, #0
|
||||
add r4, r8, #0
|
||||
add r8, r2, #0
|
||||
add r2, r5, #0
|
||||
add r5, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r6, #0
|
||||
add r6, r10, #0
|
||||
add r10, r2, #0
|
||||
add r2, r9, #0
|
||||
add r9, r10, #0
|
||||
add r10, r2, #0
|
||||
@ Offset 16 is p->ee_ws again; q8 was overwritten above and is reloaded below.
ldr r2, [r1, #16]
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
cmp r11, #0
|
||||
beq _neon_ee_o_loop2_exit
|
||||
|
|
||||
vld1.32 {d16, d17}, [r2, :128]
|
||||
@ Second ee loop: same instruction sequence as the first, run p->i1 times.
_neon_ee_o_loop2:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vsub.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 d31, d5, d2 @
|
||||
vadd.f32 d28, d4, d3 @
|
||||
vsub.f32 d30, d4, d3 @
|
||||
vsub.f32 d5, d19, d14 @
|
||||
vsub.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vadd.f32 d6, d30, d27 @
|
||||
vadd.f32 d4, d18, d15 @
|
||||
vadd.f32 d13, d19, d14 @
|
||||
vsub.f32 d12, d18, d15 @
|
||||
vadd.f32 d15, d31, d26 @
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vsub.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_o_loop2
|
||||
_neon_ee_o_loop2_exit:
|
||||
|
|
||||
@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
@ neon_static_x4_f
@
@ On entry: r0 = data, r1 = stride in bytes, r2 = 32 bytes of multipliers.
@ Loads 32 bytes from each of r0, r0 + 2*r1, r0 + 4*r1 and r0 + 6*r1, scales
@ the last two blocks by the multipliers, combines all four with adds and
@ subtracts, and writes the results back to the same four addresses.
@ There is no loop: one 32-byte group is processed per call.
@ NOTE(review): the bare trailing @ marks presumably flag the add/sub lines
@ that differ in the inverse-transform variant; confirm against its source.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_x4_f
|
||||
_neon_static_x4_f:
|
||||
#else
|
||||
.globl neon_static_x4_f
|
||||
neon_static_x4_f:
|
||||
#endif
|
||||
@ add r3, r0, #0
|
||||
@ Save the scratch core registers and the NEON registers d8-d15 (q4-q7 are written below).
push {r4, r5, r6, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
|
|
||||
@ r4 = r0 + 2*r1, r5 = r0 + 4*r1, r6 = r4 + 4*r1 = r0 + 6*r1.
vld1.32 {q8,q9}, [r0, :128]
|
||||
add r4, r0, r1, lsl #1
|
||||
vld1.32 {q10,q11}, [r4, :128]
|
||||
add r5, r0, r1, lsl #2
|
||||
vld1.32 {q12,q13}, [r5, :128]
|
||||
add r6, r4, r1, lsl #2
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
@ q2, q3 = the two multiplier vectors.
vld1.32 {q2,q3}, [r2, :128]
|
||||
|
|
||||
@ Cross products of the r5 and r6 blocks (q12-q15) with q2 and q3.
vmul.f32 q0, q13, q3
|
||||
vmul.f32 q5, q12, q2
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q4, q14, q3
|
||||
vmul.f32 q14, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vmul.f32 q12, q15, q3
|
||||
vmul.f32 q2, q15, q2
|
||||
vsub.f32 q0, q5, q0
|
||||
vadd.f32 q13, q13, q14
|
||||
vadd.f32 q12, q12, q1
|
||||
vsub.f32 q1, q2, q4
|
||||
vadd.f32 q15, q0, q12
|
||||
vsub.f32 q12, q0, q12
|
||||
vadd.f32 q14, q13, q1
|
||||
vsub.f32 q13, q13, q1
|
||||
@ Combine with the untouched r0 and r4 blocks (q8-q11) and store in place.
vadd.f32 q0, q8, q15
|
||||
vadd.f32 q1, q9, q14
|
||||
vadd.f32 q2, q10, q13 @
|
||||
vsub.f32 q4, q8, q15
|
||||
vsub.f32 q3, q11, q12 @
|
||||
vst1.32 {q0,q1}, [r0, :128]
|
||||
vsub.f32 q5, q9, q14
|
||||
vsub.f32 q6, q10, q13 @
|
||||
vadd.f32 q7, q11, q12 @
|
||||
vst1.32 {q2,q3}, [r4, :128]
|
||||
vst1.32 {q4,q5}, [r5, :128]
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
@ Restore the saved registers; popping into pc returns to the caller.
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, pc}
|
||||
|
||||
|
||||
|
||||
@ neon_static_x8_f
@
@ On entry: r0 = data, r1 = byte distance between consecutive sub-arrays,
@ r2 = multiplier table (LUT).
@ data0..data7 = r0 + k*r1 for k = 0..7. Each loop pass updates 32 bytes of
@ every sub-array in place and consumes 96 bytes of the LUT; the loop runs
@ r1 >> 5 times.
@ NOTE(review): r1 is assumed to be a non-zero multiple of 32; with r1 < 32
@ the counter starts at zero and the loop would not stop after zero passes.
@ NOTE(review): the bare trailing @ marks presumably flag the add/sub lines
@ that differ in the inverse-transform variant; confirm against its source.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_x8_f
|
||||
_neon_static_x8_f:
|
||||
#else
|
||||
.globl neon_static_x8_f
|
||||
neon_static_x8_f:
|
||||
#endif
|
||||
@ Save the core registers and the NEON registers d8-d15 that are overwritten below.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
mov r11, #0
|
||||
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
|
||||
@ r11 = -(r1 >> 5): a negative pass counter that counts up to zero.
sub r11, r11, r1, lsr #5
|
||||
neon_x8_loop:
|
||||
@ First 32 bytes of LUT for this pass: applied to data2 (r5) and data3 (r6).
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q10,q11}, [r5, :128]
|
||||
@ Flags set here are consumed by the bne at the bottom of the loop.
adds r11, r11, #1
|
||||
vmul.f32 q12, q15, q2
|
||||
vmul.f32 q8, q14, q3
|
||||
vmul.f32 q13, q14, q2
|
||||
vmul.f32 q9, q10, q3
|
||||
vmul.f32 q1, q10, q2
|
||||
vmul.f32 q0, q11, q2
|
||||
vmul.f32 q14, q11, q3
|
||||
vmul.f32 q15, q15, q3
|
||||
@ Second 32 bytes of LUT: applied to data4 (r7) and data6 (r9).
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q10, q12, q8
|
||||
vadd.f32 q11, q0, q9
|
||||
vadd.f32 q8, q15, q13
|
||||
vld1.32 {q12,q13}, [r4, :128]
|
||||
vsub.f32 q9, q1, q14
|
||||
vsub.f32 q15, q11, q10
|
||||
vsub.f32 q14, q9, q8
|
||||
vadd.f32 q4, q12, q15 @
|
||||
vsub.f32 q6, q12, q15 @
|
||||
vsub.f32 q5, q13, q14 @
|
||||
vadd.f32 q7, q13, q14 @
|
||||
vld1.32 {q14,q15}, [r9, :128]
|
||||
vld1.32 {q12,q13}, [r7, :128]
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q0, q14, q3
|
||||
@ Intermediate results for data1 and data3 are parked in place without
@ advancing r4 / r6; they are reloaded later in this same pass.
vst1.32 {q4,q5}, [r4, :128]
|
||||
vmul.f32 q14, q15, q3
|
||||
vmul.f32 q4, q15, q2
|
||||
vadd.f32 q15, q9, q8
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
vmul.f32 q8, q12, q3
|
||||
vmul.f32 q5, q13, q3
|
||||
vmul.f32 q12, q12, q2
|
||||
vmul.f32 q9, q13, q2
|
||||
vadd.f32 q14, q14, q1
|
||||
vsub.f32 q13, q4, q0
|
||||
vadd.f32 q0, q9, q8
|
||||
vld1.32 {q8,q9}, [r3, :128]
|
||||
vadd.f32 q1, q11, q10
|
||||
vsub.f32 q12, q12, q5
|
||||
vadd.f32 q11, q8, q15
|
||||
vsub.f32 q8, q8, q15
|
||||
vadd.f32 q2, q12, q14
|
||||
vsub.f32 q10, q0, q13
|
||||
vadd.f32 q15, q0, q13
|
||||
vadd.f32 q13, q9, q1
|
||||
vsub.f32 q9, q9, q1
|
||||
vsub.f32 q12, q12, q14
|
||||
vadd.f32 q0, q11, q2
|
||||
vadd.f32 q1, q13, q15
|
||||
vsub.f32 q4, q11, q2
|
||||
vadd.f32 q2, q8, q10 @
|
||||
vsub.f32 q3, q9, q12 @
|
||||
@ Final stores post-increment each data pointer by 32 bytes.
vst1.32 {q0,q1}, [r3, :128]!
|
||||
vsub.f32 q5, q13, q15
|
||||
vld1.32 {q14,q15}, [r10, :128]
|
||||
vadd.f32 q7, q9, q12 @
|
||||
vld1.32 {q12,q13}, [r8, :128]
|
||||
vst1.32 {q2,q3}, [r5, :128]!
|
||||
@ Third 32 bytes of LUT: applied to data5 (r8) and data7 (r10).
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q6, q8, q10 @
|
||||
vmul.f32 q8, q14, q2
|
||||
vst1.32 {q4,q5}, [r7, :128]!
|
||||
vmul.f32 q10, q15, q3
|
||||
vmul.f32 q9, q13, q3
|
||||
vmul.f32 q11, q12, q2
|
||||
vmul.f32 q14, q14, q3
|
||||
vst1.32 {q6,q7}, [r9, :128]!
|
||||
vmul.f32 q15, q15, q2
|
||||
vmul.f32 q12, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vadd.f32 q10, q10, q8
|
||||
vsub.f32 q11, q11, q9
|
||||
@ Reload the intermediate data1 / data3 values parked earlier in this pass.
vld1.32 {q8,q9}, [r4, :128]
|
||||
vsub.f32 q14, q15, q14
|
||||
vadd.f32 q15, q13, q12
|
||||
vadd.f32 q13, q11, q10
|
||||
vadd.f32 q12, q15, q14
|
||||
vsub.f32 q15, q15, q14
|
||||
vsub.f32 q14, q11, q10
|
||||
vld1.32 {q10,q11}, [r6, :128]
|
||||
vadd.f32 q0, q8, q13
|
||||
vadd.f32 q1, q9, q12
|
||||
vadd.f32 q2, q10, q15 @
|
||||
vsub.f32 q3, q11, q14 @
|
||||
vsub.f32 q4, q8, q13
|
||||
vst1.32 {q0,q1}, [r4, :128]!
|
||||
vsub.f32 q5, q9, q12
|
||||
vsub.f32 q6, q10, q15 @
|
||||
vst1.32 {q2,q3}, [r6, :128]!
|
||||
vadd.f32 q7, q11, q14 @
|
||||
vst1.32 {q4,q5}, [r8, :128]!
|
||||
vst1.32 {q6,q7}, [r10, :128]!
|
||||
bne neon_x8_loop
|
||||
|
|
||||
@ Restore the saved registers; popping into pc returns to the caller.
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
@ neon_static_x8_t_f
@
@ Same register interface and arithmetic as neon_static_x8_f: r0 = data,
@ r1 = byte distance between consecutive sub-arrays, r2 = multiplier table.
@ The only difference is that the eight post-incrementing result stores use
@ vst2.32 (two-way interleaved store of a register pair) instead of vst1.32.
@ The loop runs r1 >> 5 times and consumes 96 bytes of LUT per pass.
@ NOTE(review): r1 is assumed to be a non-zero multiple of 32; with r1 < 32
@ the counter starts at zero and the loop would not stop after zero passes.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_x8_t_f
|
||||
_neon_static_x8_t_f:
|
||||
#else
|
||||
.globl neon_static_x8_t_f
|
||||
neon_static_x8_t_f:
|
||||
#endif
|
||||
@ Save the core registers and the NEON registers d8-d15 that are overwritten below.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
mov r11, #0
|
||||
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
|
||||
@ r11 = -(r1 >> 5): a negative pass counter that counts up to zero.
sub r11, r11, r1, lsr #5
|
||||
neon_x8_t_loop:
|
||||
@ First 32 bytes of LUT for this pass: applied to data2 (r5) and data3 (r6).
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q10,q11}, [r5, :128]
|
||||
@ Flags set here are consumed by the bne at the bottom of the loop.
adds r11, r11, #1
|
||||
vmul.f32 q12, q15, q2
|
||||
vmul.f32 q8, q14, q3
|
||||
vmul.f32 q13, q14, q2
|
||||
vmul.f32 q9, q10, q3
|
||||
vmul.f32 q1, q10, q2
|
||||
vmul.f32 q0, q11, q2
|
||||
vmul.f32 q14, q11, q3
|
||||
vmul.f32 q15, q15, q3
|
||||
@ Second 32 bytes of LUT: applied to data4 (r7) and data6 (r9).
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q10, q12, q8
|
||||
vadd.f32 q11, q0, q9
|
||||
vadd.f32 q8, q15, q13
|
||||
vld1.32 {q12,q13}, [r4, :128]
|
||||
vsub.f32 q9, q1, q14
|
||||
vsub.f32 q15, q11, q10
|
||||
vsub.f32 q14, q9, q8
|
||||
vadd.f32 q4, q12, q15 @
|
||||
vsub.f32 q6, q12, q15 @
|
||||
vsub.f32 q5, q13, q14 @
|
||||
vadd.f32 q7, q13, q14 @
|
||||
vld1.32 {q14,q15}, [r9, :128]
|
||||
vld1.32 {q12,q13}, [r7, :128]
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q0, q14, q3
|
||||
@ Intermediate data1 / data3 values are parked with plain vst1 (no
@ interleave, no pointer advance) because they are reloaded later in this pass.
vst1.32 {q4,q5}, [r4, :128]
|
||||
vmul.f32 q14, q15, q3
|
||||
vmul.f32 q4, q15, q2
|
||||
vadd.f32 q15, q9, q8
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
vmul.f32 q8, q12, q3
|
||||
vmul.f32 q5, q13, q3
|
||||
vmul.f32 q12, q12, q2
|
||||
vmul.f32 q9, q13, q2
|
||||
vadd.f32 q14, q14, q1
|
||||
vsub.f32 q13, q4, q0
|
||||
vadd.f32 q0, q9, q8
|
||||
vld1.32 {q8,q9}, [r3, :128]
|
||||
vadd.f32 q1, q11, q10
|
||||
vsub.f32 q12, q12, q5
|
||||
vadd.f32 q11, q8, q15
|
||||
vsub.f32 q8, q8, q15
|
||||
vadd.f32 q2, q12, q14
|
||||
vsub.f32 q10, q0, q13
|
||||
vadd.f32 q15, q0, q13
|
||||
vadd.f32 q13, q9, q1
|
||||
vsub.f32 q9, q9, q1
|
||||
vsub.f32 q12, q12, q14
|
||||
vadd.f32 q0, q11, q2
|
||||
vadd.f32 q1, q13, q15
|
||||
vsub.f32 q4, q11, q2
|
||||
vadd.f32 q2, q8, q10 @
|
||||
vsub.f32 q3, q9, q12 @
|
||||
@ Final stores: interleaved (vst2) and post-incrementing by 32 bytes.
vst2.32 {q0,q1}, [r3, :128]!
|
||||
vsub.f32 q5, q13, q15
|
||||
vld1.32 {q14,q15}, [r10, :128]
|
||||
vadd.f32 q7, q9, q12 @
|
||||
vld1.32 {q12,q13}, [r8, :128]
|
||||
vst2.32 {q2,q3}, [r5, :128]!
|
||||
@ Third 32 bytes of LUT: applied to data5 (r8) and data7 (r10).
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q6, q8, q10 @
|
||||
vmul.f32 q8, q14, q2
|
||||
vst2.32 {q4,q5}, [r7, :128]!
|
||||
vmul.f32 q10, q15, q3
|
||||
vmul.f32 q9, q13, q3
|
||||
vmul.f32 q11, q12, q2
|
||||
vmul.f32 q14, q14, q3
|
||||
vst2.32 {q6,q7}, [r9, :128]!
|
||||
vmul.f32 q15, q15, q2
|
||||
vmul.f32 q12, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vadd.f32 q10, q10, q8
|
||||
vsub.f32 q11, q11, q9
|
||||
@ Reload the intermediate data1 / data3 values parked earlier in this pass.
vld1.32 {q8,q9}, [r4, :128]
|
||||
vsub.f32 q14, q15, q14
|
||||
vadd.f32 q15, q13, q12
|
||||
vadd.f32 q13, q11, q10
|
||||
vadd.f32 q12, q15, q14
|
||||
vsub.f32 q15, q15, q14
|
||||
vsub.f32 q14, q11, q10
|
||||
vld1.32 {q10,q11}, [r6, :128]
|
||||
vadd.f32 q0, q8, q13
|
||||
vadd.f32 q1, q9, q12
|
||||
vadd.f32 q2, q10, q15 @
|
||||
vsub.f32 q3, q11, q14 @
|
||||
vsub.f32 q4, q8, q13
|
||||
vst2.32 {q0,q1}, [r4, :128]!
|
||||
vsub.f32 q5, q9, q12
|
||||
vsub.f32 q6, q10, q15 @
|
||||
vst2.32 {q2,q3}, [r6, :128]!
|
||||
vadd.f32 q7, q11, q14 @
|
||||
vst2.32 {q4,q5}, [r8, :128]!
|
||||
vst2.32 {q6,q7}, [r10, :128]!
|
||||
bne neon_x8_t_loop
|
||||
|
|
||||
@ Restore the saved registers; popping into pc returns to the caller.
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
|
||||
955
3rdparty/ffts/ffts-master/src/neon_static_i.s
vendored
Normal file
955
3rdparty/ffts/ffts-master/src/neon_static_i.s
vendored
Normal file
@@ -0,0 +1,955 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
@ neon_static_e_i
@
@ On entry: r0 = p (plan structure), r1 = input base, r2 = output base.
@ Words of p read here (byte offsets): 0 = table of 32-bit output offsets,
@ 12 = 16 bytes of multipliers used by the single transition block,
@ 16 = p->ee_ws, 28 = p->i0, 32 = p->i1, 40 = p->N.
@ Structure: an ee loop run p->i0 times, one transition block, an oo loop run
@ p->i1 times (skipped when zero), then a second ee loop run p->i1 times
@ (skipped when zero).
@ A bare trailing @ marks an add/sub whose sense is flipped relative to the
@ matching line of the _f routines (visible for the ee and oo loops;
@ presumably the same holds for the transition block - confirm against
@ neon_static_e_f).
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_e_i
|
||||
_neon_static_e_i:
|
||||
#else
|
||||
.globl neon_static_e_i
|
||||
neon_static_e_i:
|
||||
#endif
|
||||
@ Save the core registers and the NEON registers d8-d15 that are overwritten below.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
@ Build eight input pointers spaced lr bytes apart, in the order
@ r3, r7, r5, r10, r4, r8, r6, r9.
ldr lr, [r0, #40] @ this is p->N
|
||||
add r3, r1, #0
|
||||
add r7, r1, lr
|
||||
add r5, r7, lr
|
||||
add r10, r5, lr
|
||||
add r4, r10, lr
|
||||
add r8, r4, lr
|
||||
add r6, r8, lr
|
||||
add r9, r6, lr
|
||||
@ r12 walks the output offset table; from here on r1 = p and r0 = output base.
ldr r12, [r0]
|
||||
add r1, r0, #0
|
||||
add r0, r2, #0
|
||||
ldr r2, [r1, #16] @ this is p->ee_ws
|
||||
ldr r11, [r1, #28] @ this is p->i0
|
||||
|
|
||||
@ q8 (d16, d17) holds the ee multipliers for the whole first loop.
vld1.32 {d16, d17}, [r2, :128]
|
||||
@ First ee loop. The counter is decremented before it is tested, so p->i0 is
@ assumed to be at least 1.
_neon_ee_loop:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
@ Flags set here are consumed by the bne at the bottom of the loop.
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vadd.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vsub.f32 d31, d5, d2 @
|
||||
vsub.f32 d28, d4, d3 @
|
||||
vadd.f32 d30, d4, d3 @
|
||||
vadd.f32 d5, d19, d14 @
|
||||
vadd.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vsub.f32 d6, d30, d27 @
|
||||
vsub.f32 d4, d18, d15 @
|
||||
vsub.f32 d13, d19, d14 @
|
||||
vadd.f32 d12, d18, d15 @
|
||||
vsub.f32 d15, d31, d26 @
|
||||
@ Fetch two output offsets; each is scaled by 4 and added to the output base.
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vadd.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_loop
|
||||
|
|
||||
@ Single transition block, executed exactly once. r11 points at the 16 bytes
@ of multipliers that are loaded into d20, d21 further down.
ldr r11, [r1, #12]
|
||||
vld2.32 {q9}, [r5, :128]! @tag2
|
||||
vld2.32 {q13}, [r3, :128]! @tag0
|
||||
vld2.32 {q12}, [r4, :128]! @tag1
|
||||
vld2.32 {q0}, [r7, :128]! @tag4
|
||||
vsub.f32 q11, q13, q12
|
||||
vld2.32 {q8}, [r6, :128]! @tag3
|
||||
vadd.f32 q12, q13, q12
|
||||
vsub.f32 q10, q9, q8
|
||||
vadd.f32 q8, q9, q8
|
||||
vadd.f32 q9, q12, q8
|
||||
vadd.f32 d9, d23, d20 @
|
||||
vsub.f32 d11, d23, d20 @
|
||||
vsub.f32 q8, q12, q8
|
||||
vsub.f32 d8, d22, d21 @
|
||||
vadd.f32 d10, d22, d21 @
|
||||
ldr r2, [r12], #4
|
||||
vld1.32 {d20, d21}, [r11, :128]
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q9, q4
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q8, q5
|
||||
add lr, r0, lr, lsl #2
|
||||
vswp d9,d10
|
||||
vst1.32 {d8,d9,d10,d11}, [lr, :128]!
|
||||
vld2.32 {q13}, [r10, :128]! @tag7
|
||||
vld2.32 {q15}, [r9, :128]! @tag6
|
||||
vld2.32 {q11}, [r8, :128]! @tag5
|
||||
vsub.f32 q14, q15, q13
|
||||
vsub.f32 q12, q0, q11
|
||||
vadd.f32 q11, q0, q11
|
||||
vadd.f32 q13, q15, q13
|
||||
vadd.f32 d13, d29, d24 @
|
||||
vadd.f32 q15, q13, q11
|
||||
vsub.f32 d12, d28, d25 @
|
||||
vsub.f32 d15, d29, d24 @
|
||||
vadd.f32 d14, d28, d25 @
|
||||
vtrn.32 q15, q6
|
||||
vsub.f32 q15, q13, q11
|
||||
vtrn.32 q15, q7
|
||||
vswp d13, d14
|
||||
vst1.32 {d12,d13,d14,d15}, [lr, :128]!
|
||||
vtrn.32 q13, q14
|
||||
vtrn.32 q11, q12
|
||||
vmul.f32 d24, d26, d21
|
||||
vmul.f32 d28, d27, d20
|
||||
vmul.f32 d25, d26, d20
|
||||
vmul.f32 d26, d27, d21
|
||||
vmul.f32 d27, d22, d21
|
||||
vmul.f32 d30, d23, d20
|
||||
vmul.f32 d29, d23, d21
|
||||
vmul.f32 d22, d22, d20
|
||||
vsub.f32 d21, d28, d24
|
||||
vadd.f32 d20, d26, d25
|
||||
vadd.f32 d25, d30, d27
|
||||
vsub.f32 d24, d22, d29
|
||||
vadd.f32 q11, q12, q10
|
||||
vsub.f32 q10, q12, q10
|
||||
vadd.f32 q0, q9, q11
|
||||
vsub.f32 q2, q9, q11
|
||||
vadd.f32 d3, d17, d20 @
|
||||
vsub.f32 d7, d17, d20 @
|
||||
vsub.f32 d2, d16, d21 @
|
||||
vadd.f32 d6, d16, d21 @
|
||||
vswp d1, d2
|
||||
vswp d5, d6
|
||||
vstmia r2!, {q0-q3}
|
||||
|
|
||||
@ Swap r7 with r9 and r8 with r10, using r2 as scratch.
add r2, r7, #0
|
||||
add r7, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r8, #0
|
||||
add r8, r10, #0
|
||||
add r10, r2, #0
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
cmp r11, #0
|
||||
beq _neon_oo_loop_exit
|
||||
@ oo loop: run p->i1 times; it uses no multiplier table.
_neon_oo_loop:
|
||||
vld2.32 {q8}, [r6, :128]!
|
||||
vld2.32 {q9}, [r5, :128]!
|
||||
vld2.32 {q10}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vadd.f32 q11, q9, q8
|
||||
vsub.f32 q8, q9, q8
|
||||
vsub.f32 q9, q13, q10
|
||||
vadd.f32 q12, q13, q10
|
||||
subs r11, r11, #1
|
||||
vld2.32 {q10}, [r7, :128]!
|
||||
vld2.32 {q13}, [r9, :128]!
|
||||
vsub.f32 q2, q12, q11
|
||||
vsub.f32 d7, d19, d16 @
|
||||
vadd.f32 d3, d19, d16 @
|
||||
vadd.f32 d6, d18, d17 @
|
||||
vsub.f32 d2, d18, d17 @
|
||||
vld2.32 {q9}, [r8, :128]!
|
||||
vld2.32 {q8}, [r10, :128]!
|
||||
vadd.f32 q0, q12, q11
|
||||
vadd.f32 q11, q13, q8
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 q8, q13, q8
|
||||
vsub.f32 q9, q10, q9
|
||||
vsub.f32 q6, q12, q11
|
||||
vadd.f32 q4, q12, q11
|
||||
vtrn.32 q0, q2
|
||||
ldr r2, [r12], #4
|
||||
vsub.f32 d15, d19, d16 @
|
||||
ldr lr, [r12], #4
|
||||
vadd.f32 d11, d19, d16 @
|
||||
vadd.f32 d14, d18, d17 @
|
||||
vsub.f32 d10, d18, d17 @
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q1, q3
|
||||
add lr, r0, lr, lsl #2
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_oo_loop
|
||||
_neon_oo_loop_exit:
|
||||
|
|
||||
@ Re-map the input pointers for the second ee loop: swap r3/r7, r4/r8,
@ r5/r9 and r6/r10, then swap r9/r10 once more (r2 is the scratch register).
add r2, r3, #0
|
||||
add r3, r7, #0
|
||||
add r7, r2, #0
|
||||
add r2, r4, #0
|
||||
add r4, r8, #0
|
||||
add r8, r2, #0
|
||||
add r2, r5, #0
|
||||
add r5, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r6, #0
|
||||
add r6, r10, #0
|
||||
add r10, r2, #0
|
||||
add r2, r9, #0
|
||||
add r9, r10, #0
|
||||
add r10, r2, #0
|
||||
@ Offset 16 is p->ee_ws again; q8 was overwritten above and is reloaded below.
ldr r2, [r1, #16]
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
cmp r11, #0
|
||||
beq _neon_ee_loop2_exit
|
||||
|
|
||||
vld1.32 {d16, d17}, [r2, :128]
|
||||
@ Second ee loop: same instruction sequence as the first, run p->i1 times.
_neon_ee_loop2:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vadd.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vsub.f32 d31, d5, d2 @
|
||||
vsub.f32 d28, d4, d3 @
|
||||
vadd.f32 d30, d4, d3 @
|
||||
vadd.f32 d5, d19, d14 @
|
||||
vadd.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vsub.f32 d6, d30, d27 @
|
||||
vsub.f32 d4, d18, d15 @
|
||||
vsub.f32 d13, d19, d14 @
|
||||
vadd.f32 d12, d18, d15 @
|
||||
vsub.f32 d15, d31, d26 @
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vadd.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_loop2
|
||||
_neon_ee_loop2_exit:
|
||||
|
|
||||
@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
|
||||
|
||||
|
||||
@ neon_static_o_i -- entry and pointer setup.
@ On entry r0 points at the plan structure (fields are read from it at
@ offsets 0, 16, 28 and 40 below), r1 is the base of the first data pointer
@ and r2 is later used as the base for computed store addresses.
@ NOTE(review): the argument roles (plan, input, output) are inferred from
@ how the registers are used here -- confirm against the C caller.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_o_i
|
||||
_neon_static_o_i:
|
||||
#else
|
||||
.globl neon_static_o_i
|
||||
neon_static_o_i:
|
||||
#endif
|
||||
@ Save the integer registers this routine overwrites (r4-r11) plus lr;
@ the matching pop at the end loads the saved lr into pc to return.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
@ Save d8-d15 (q4-q7), which the loops below use as scratch.
vstmdb sp!, {d8-d15}
|
||||
ldr lr, [r0, #40] @ this is p->N
|
||||
@ Build eight read pointers, each one lr bytes after the previous, starting
@ at r1. In address order they are: r3, r7, r5, r10, r4, r8, r6, r9.
add r3, r1, #0
|
||||
add r7, r1, lr
|
||||
add r5, r7, lr
|
||||
add r10, r5, lr
|
||||
add r4, r10, lr
|
||||
add r8, r4, lr
|
||||
add r6, r8, lr
|
||||
add r9, r6, lr
|
||||
@ r12 = word at offset 0 of the plan; the loops read 32-bit entries from it
@ with post-increment (ldr rX, [r12], #4) to compute store addresses.
ldr r12, [r0]
|
||||
@ Move the plan pointer to r1 and the third argument to r0, freeing r2 for
@ use as a scratch/address register.
add r1, r0, #0
|
||||
add r0, r2, #0
|
||||
ldr r2, [r1, #16] @ this is p->ee_ws
|
||||
@ r11 is the iteration count of the first loop (decremented with subs).
ldr r11, [r1, #28] @ this is p->i0
|
||||
|
||||
@ Load 16 bytes of constants from p->ee_ws into d16/d17 for the first loop.
vld1.32 {d16, d17}, [r2, :128]
|
||||
_neon_ee_o_loop:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vadd.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vsub.f32 d31, d5, d2 @
|
||||
vsub.f32 d28, d4, d3 @
|
||||
vadd.f32 d30, d4, d3 @
|
||||
vadd.f32 d5, d19, d14 @
|
||||
vadd.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vsub.f32 d6, d30, d27 @
|
||||
vsub.f32 d4, d18, d15 @
|
||||
vsub.f32 d13, d19, d14 @
|
||||
vadd.f32 d12, d18, d15 @
|
||||
vsub.f32 d15, d31, d26 @
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vadd.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_o_loop
|
||||
|
||||
add r2, r7, #0
|
||||
add r7, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r8, #0
|
||||
add r8, r10, #0
|
||||
add r10, r2, #0
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
cmp r11, #0
|
||||
beq _neon_oo_o_loop_exit
|
||||
_neon_oo_o_loop:
|
||||
vld2.32 {q8}, [r6, :128]!
|
||||
vld2.32 {q9}, [r5, :128]!
|
||||
vld2.32 {q10}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vadd.f32 q11, q9, q8
|
||||
vsub.f32 q8, q9, q8
|
||||
vsub.f32 q9, q13, q10
|
||||
vadd.f32 q12, q13, q10
|
||||
subs r11, r11, #1
|
||||
vld2.32 {q10}, [r7, :128]!
|
||||
vld2.32 {q13}, [r9, :128]!
|
||||
vsub.f32 q2, q12, q11
|
||||
vsub.f32 d7, d19, d16 @
|
||||
vadd.f32 d3, d19, d16 @
|
||||
vadd.f32 d6, d18, d17 @
|
||||
vsub.f32 d2, d18, d17 @
|
||||
vld2.32 {q9}, [r8, :128]!
|
||||
vld2.32 {q8}, [r10, :128]!
|
||||
vadd.f32 q0, q12, q11
|
||||
vadd.f32 q11, q13, q8
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 q8, q13, q8
|
||||
vsub.f32 q9, q10, q9
|
||||
vsub.f32 q6, q12, q11
|
||||
vadd.f32 q4, q12, q11
|
||||
vtrn.32 q0, q2
|
||||
ldr r2, [r12], #4
|
||||
vsub.f32 d15, d19, d16 @
|
||||
ldr lr, [r12], #4
|
||||
vadd.f32 d11, d19, d16 @
|
||||
vadd.f32 d14, d18, d17 @
|
||||
vsub.f32 d10, d18, d17 @
|
||||
add r2, r0, r2, lsl #2
|
||||
vtrn.32 q1, q3
|
||||
add lr, r0, lr, lsl #2
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_oo_o_loop
|
||||
_neon_oo_o_loop_exit:
|
||||
|
||||
ldr r11, [r1, #8]
|
||||
vld1.32 {q8}, [r5, :128]!
|
||||
vld1.32 {q10}, [r6, :128]!
|
||||
vld2.32 {q11}, [r4, :128]!
|
||||
vld2.32 {q13}, [r3, :128]!
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vorr d25, d17, d17
|
||||
vorr d24, d20, d20
|
||||
vorr d20, d16, d16
|
||||
vsub.f32 q9, q13, q11
|
||||
vadd.f32 q11, q13, q11
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 d24, d25
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 d20, d21
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q8, q10, q12
|
||||
add lr, r0, lr, lsl #2
|
||||
vadd.f32 q10, q10, q12
|
||||
vadd.f32 q0, q11, q10
|
||||
vadd.f32 d25, d19, d16 @
|
||||
vsub.f32 d27, d19, d16 @
|
||||
vsub.f32 q1, q11, q10
|
||||
vsub.f32 d24, d18, d17 @
|
||||
vadd.f32 d26, d18, d17 @
|
||||
vtrn.32 q0, q12
|
||||
vtrn.32 q1, q13
|
||||
vld1.32 {d24, d25}, [r11, :128]
|
||||
vswp d1, d2
|
||||
vst1.32 {q0, q1}, [r2, :128]!
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
vadd.f32 q1, q0, q15
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vsub.f32 q15, q0, q15
|
||||
vsub.f32 q0, q14, q13
|
||||
vadd.f32 q3, q14, q13
|
||||
vadd.f32 q2, q3, q1
|
||||
vadd.f32 d29, d1, d30 @
|
||||
vsub.f32 d27, d1, d30 @
|
||||
vsub.f32 q3, q3, q1
|
||||
vsub.f32 d28, d0, d31 @
|
||||
vadd.f32 d26, d0, d31 @
|
||||
vtrn.32 q2, q14
|
||||
vtrn.32 q3, q13
|
||||
vswp d5, d6
|
||||
vst1.32 {q2, q3}, [r2, :128]!
|
||||
vtrn.32 q11, q9
|
||||
vtrn.32 q10, q8
|
||||
vmul.f32 d20, d18, d25
|
||||
vmul.f32 d22, d19, d24
|
||||
vmul.f32 d21, d19, d25
|
||||
vmul.f32 d18, d18, d24
|
||||
vmul.f32 d19, d16, d25
|
||||
vmul.f32 d30, d17, d24
|
||||
vmul.f32 d23, d16, d24
|
||||
vmul.f32 d24, d17, d25
|
||||
vadd.f32 d17, d22, d20
|
||||
vsub.f32 d16, d18, d21
|
||||
vsub.f32 d21, d30, d19
|
||||
vadd.f32 d20, d24, d23
|
||||
vadd.f32 q9, q8, q10
|
||||
vsub.f32 q8, q8, q10
|
||||
vadd.f32 q4, q14, q9
|
||||
vsub.f32 q6, q14, q9
|
||||
vadd.f32 d11, d27, d16 @
|
||||
vsub.f32 d15, d27, d16 @
|
||||
vsub.f32 d10, d26, d17 @
|
||||
vadd.f32 d14, d26, d17 @
|
||||
vswp d9, d10
|
||||
vswp d13, d14
|
||||
vstmia lr!, {q4-q7}
|
||||
|
||||
|
||||
add r2, r3, #0
|
||||
add r3, r7, #0
|
||||
add r7, r2, #0
|
||||
add r2, r4, #0
|
||||
add r4, r8, #0
|
||||
add r8, r2, #0
|
||||
add r2, r5, #0
|
||||
add r5, r9, #0
|
||||
add r9, r2, #0
|
||||
add r2, r6, #0
|
||||
add r6, r10, #0
|
||||
add r10, r2, #0
|
||||
add r2, r9, #0
|
||||
add r9, r10, #0
|
||||
add r10, r2, #0
|
||||
ldr r2, [r1, #16]
|
||||
ldr r11, [r1, #32] @ this is p->i1
|
||||
cmp r11, #0
|
||||
beq _neon_ee_o_loop2_exit
|
||||
|
||||
vld1.32 {d16, d17}, [r2, :128]
|
||||
_neon_ee_o_loop2:
|
||||
vld2.32 {q15}, [r10, :128]!
|
||||
vld2.32 {q13}, [r8, :128]!
|
||||
vld2.32 {q14}, [r7, :128]!
|
||||
vld2.32 {q9}, [r4, :128]!
|
||||
vld2.32 {q10}, [r3, :128]!
|
||||
vld2.32 {q11}, [r6, :128]!
|
||||
vld2.32 {q12}, [r5, :128]!
|
||||
vsub.f32 q1, q14, q13
|
||||
vld2.32 {q0}, [r9, :128]!
|
||||
subs r11, r11, #1
|
||||
vsub.f32 q2, q0, q15
|
||||
vadd.f32 q0, q0, q15
|
||||
vmul.f32 d10, d2, d17
|
||||
vmul.f32 d11, d3, d16
|
||||
vmul.f32 d12, d3, d17
|
||||
vmul.f32 d6, d4, d17
|
||||
vmul.f32 d7, d5, d16
|
||||
vmul.f32 d8, d4, d16
|
||||
vmul.f32 d9, d5, d17
|
||||
vmul.f32 d13, d2, d16
|
||||
vsub.f32 d7, d7, d6
|
||||
vadd.f32 d11, d11, d10
|
||||
vsub.f32 q1, q12, q11
|
||||
vsub.f32 q2, q10, q9
|
||||
vadd.f32 d6, d9, d8
|
||||
vadd.f32 q4, q14, q13
|
||||
vadd.f32 q11, q12, q11
|
||||
vadd.f32 q12, q10, q9
|
||||
vsub.f32 d10, d13, d12
|
||||
vsub.f32 q7, q4, q0
|
||||
vsub.f32 q9, q12, q11
|
||||
vsub.f32 q13, q5, q3
|
||||
vadd.f32 d29, d5, d2 @
|
||||
vadd.f32 q5, q5, q3
|
||||
vadd.f32 q10, q4, q0
|
||||
vadd.f32 q11, q12, q11
|
||||
vsub.f32 d31, d5, d2 @
|
||||
vsub.f32 d28, d4, d3 @
|
||||
vadd.f32 d30, d4, d3 @
|
||||
vadd.f32 d5, d19, d14 @
|
||||
vadd.f32 d7, d31, d26 @
|
||||
vadd.f32 q1, q14, q5
|
||||
vadd.f32 q0, q11, q10
|
||||
vsub.f32 d6, d30, d27 @
|
||||
vsub.f32 d4, d18, d15 @
|
||||
vsub.f32 d13, d19, d14 @
|
||||
vadd.f32 d12, d18, d15 @
|
||||
vsub.f32 d15, d31, d26 @
|
||||
ldr r2, [r12], #4
|
||||
vtrn.32 q1, q3
|
||||
ldr lr, [r12], #4
|
||||
vtrn.32 q0, q2
|
||||
add r2, r0, r2, lsl #2
|
||||
vsub.f32 q4, q11, q10
|
||||
add lr, r0, lr, lsl #2
|
||||
vsub.f32 q5, q14, q5
|
||||
vadd.f32 d14, d30, d27 @
|
||||
vst2.32 {q0,q1}, [r2, :128]!
|
||||
vst2.32 {q2,q3}, [lr, :128]!
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vst2.32 {q4,q5}, [r2, :128]!
|
||||
vst2.32 {q6,q7}, [lr, :128]!
|
||||
bne _neon_ee_o_loop2
|
||||
_neon_ee_o_loop2_exit:
|
||||
|
||||
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
@ neon_static_x4_i -- single in-place pass over four 32-byte data groups.
@ r0 = base data pointer, r1 = byte stride unit, r2 = pointer to 32 bytes of
@ multiplier constants. The four groups live at r0, r0 + 2*r1, r0 + 4*r1 and
@ r0 + 6*r1; each is loaded, combined, and written back to the same address.
@ All vector accesses carry a :128 alignment hint, so the addresses are
@ expected to be 16-byte aligned.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_x4_i
|
||||
_neon_static_x4_i:
|
||||
#else
|
||||
.globl neon_static_x4_i
|
||||
neon_static_x4_i:
|
||||
#endif
|
||||
@ add r3, r0, #0
|
||||
@ Save the address registers used below plus lr, then d8-d15 (q4-q7).
push {r4, r5, r6, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
|
||||
@ Load the four data groups; r4/r5/r6 keep their addresses for the stores.
vld1.32 {q8,q9}, [r0, :128]
|
||||
add r4, r0, r1, lsl #1
|
||||
vld1.32 {q10,q11}, [r4, :128]
|
||||
add r5, r0, r1, lsl #2
|
||||
vld1.32 {q12,q13}, [r5, :128]
|
||||
add r6, r4, r1, lsl #2
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
@ q2/q3 = multiplier constants.
vld1.32 {q2,q3}, [r2, :128]
|
||||
|
||||
@ Eight element-wise products of (q12,q13) and (q14,q15) with (q2,q3),
@ combined just below into:
@   q0  = q12*q2 - q13*q3      q13 = q13*q2 + q12*q3
@   q12 = q15*q3 + q14*q2      q1  = q15*q2 - q14*q3
@ NOTE(review): this has the shape of a complex multiply of group r5 by the
@ constant and of group r6 by its conjugate, assuming the first register of
@ each pair holds real parts and the second imaginary parts -- confirm.
vmul.f32 q0, q13, q3
|
||||
vmul.f32 q5, q12, q2
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q4, q14, q3
|
||||
vmul.f32 q14, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vmul.f32 q12, q15, q3
|
||||
vmul.f32 q2, q15, q2
|
||||
vsub.f32 q0, q5, q0
|
||||
vadd.f32 q13, q13, q14
|
||||
vadd.f32 q12, q12, q1
|
||||
vsub.f32 q1, q2, q4
|
||||
@ Sums and differences of the two products.
vadd.f32 q15, q0, q12
|
||||
vsub.f32 q12, q0, q12
|
||||
vadd.f32 q14, q13, q1
|
||||
vsub.f32 q13, q13, q1
|
||||
@ Combine with the unmultiplied groups (q8,q9) and (q10,q11); the stores are
@ interleaved with the remaining arithmetic.
vadd.f32 q0, q8, q15
|
||||
vadd.f32 q1, q9, q14
|
||||
vsub.f32 q2, q10, q13 @
|
||||
vsub.f32 q4, q8, q15
|
||||
vadd.f32 q3, q11, q12 @
|
||||
vst1.32 {q0,q1}, [r0, :128]
|
||||
vsub.f32 q5, q9, q14
|
||||
vadd.f32 q6, q10, q13 @
|
||||
vsub.f32 q7, q11, q12 @
|
||||
vst1.32 {q2,q3}, [r4, :128]
|
||||
vst1.32 {q4,q5}, [r5, :128]
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
@ Restore d8-d15 and r4-r6; popping the saved lr into pc returns.
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, pc}
|
||||
|
||||
|
||||
|
||||
@ neon_static_x8_i -- entry and pointer setup for the eight-stream pass.
@ r0 = base data pointer, r1 = byte distance between consecutive streams,
@ r2 = lookup-table pointer. The loop that follows updates all eight streams
@ in place and stores with vst1.32.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_x8_i
|
||||
_neon_static_x8_i:
|
||||
#else
|
||||
.globl neon_static_x8_i
|
||||
neon_static_x8_i:
|
||||
#endif
|
||||
@ Save r4-r11 and lr, then d8-d15 (q4-q7), all of which the loop overwrites.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
@ r11 becomes a negative loop counter (see the sub below).
mov r11, #0
|
||||
@ dataK = r0 + K*r1 for K = 0..7, held in r3..r10.
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
||||
@ r11 = -(r1 >> 5); the loop does "adds r11, r11, #1" and repeats until it
@ reaches zero, i.e. r1/32 iterations.
sub r11, r11, r1, lsr #5
|
||||
neon_x8_loop:
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q10,q11}, [r5, :128]
|
||||
adds r11, r11, #1
|
||||
vmul.f32 q12, q15, q2
|
||||
vmul.f32 q8, q14, q3
|
||||
vmul.f32 q13, q14, q2
|
||||
vmul.f32 q9, q10, q3
|
||||
vmul.f32 q1, q10, q2
|
||||
vmul.f32 q0, q11, q2
|
||||
vmul.f32 q14, q11, q3
|
||||
vmul.f32 q15, q15, q3
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q10, q12, q8
|
||||
vadd.f32 q11, q0, q9
|
||||
vadd.f32 q8, q15, q13
|
||||
vld1.32 {q12,q13}, [r4, :128]
|
||||
vsub.f32 q9, q1, q14
|
||||
vsub.f32 q15, q11, q10
|
||||
vsub.f32 q14, q9, q8
|
||||
vsub.f32 q4, q12, q15 @
|
||||
vadd.f32 q6, q12, q15 @
|
||||
vadd.f32 q5, q13, q14 @
|
||||
vsub.f32 q7, q13, q14 @
|
||||
vld1.32 {q14,q15}, [r9, :128]
|
||||
vld1.32 {q12,q13}, [r7, :128]
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q0, q14, q3
|
||||
vst1.32 {q4,q5}, [r4, :128]
|
||||
vmul.f32 q14, q15, q3
|
||||
vmul.f32 q4, q15, q2
|
||||
vadd.f32 q15, q9, q8
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
vmul.f32 q8, q12, q3
|
||||
vmul.f32 q5, q13, q3
|
||||
vmul.f32 q12, q12, q2
|
||||
vmul.f32 q9, q13, q2
|
||||
vadd.f32 q14, q14, q1
|
||||
vsub.f32 q13, q4, q0
|
||||
vadd.f32 q0, q9, q8
|
||||
vld1.32 {q8,q9}, [r3, :128]
|
||||
vadd.f32 q1, q11, q10
|
||||
vsub.f32 q12, q12, q5
|
||||
vadd.f32 q11, q8, q15
|
||||
vsub.f32 q8, q8, q15
|
||||
vadd.f32 q2, q12, q14
|
||||
vsub.f32 q10, q0, q13
|
||||
vadd.f32 q15, q0, q13
|
||||
vadd.f32 q13, q9, q1
|
||||
vsub.f32 q9, q9, q1
|
||||
vsub.f32 q12, q12, q14
|
||||
vadd.f32 q0, q11, q2
|
||||
vadd.f32 q1, q13, q15
|
||||
vsub.f32 q4, q11, q2
|
||||
vsub.f32 q2, q8, q10 @
|
||||
vadd.f32 q3, q9, q12 @
|
||||
vst1.32 {q0,q1}, [r3, :128]!
|
||||
vsub.f32 q5, q13, q15
|
||||
vld1.32 {q14,q15}, [r10, :128]
|
||||
vsub.f32 q7, q9, q12 @
|
||||
vld1.32 {q12,q13}, [r8, :128]
|
||||
vst1.32 {q2,q3}, [r5, :128]!
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vadd.f32 q6, q8, q10 @
|
||||
vmul.f32 q8, q14, q2
|
||||
vst1.32 {q4,q5}, [r7, :128]!
|
||||
vmul.f32 q10, q15, q3
|
||||
vmul.f32 q9, q13, q3
|
||||
vmul.f32 q11, q12, q2
|
||||
vmul.f32 q14, q14, q3
|
||||
vst1.32 {q6,q7}, [r9, :128]!
|
||||
vmul.f32 q15, q15, q2
|
||||
vmul.f32 q12, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vadd.f32 q10, q10, q8
|
||||
vsub.f32 q11, q11, q9
|
||||
vld1.32 {q8,q9}, [r4, :128]
|
||||
vsub.f32 q14, q15, q14
|
||||
vadd.f32 q15, q13, q12
|
||||
vadd.f32 q13, q11, q10
|
||||
vadd.f32 q12, q15, q14
|
||||
vsub.f32 q15, q15, q14
|
||||
vsub.f32 q14, q11, q10
|
||||
vld1.32 {q10,q11}, [r6, :128]
|
||||
vadd.f32 q0, q8, q13
|
||||
vadd.f32 q1, q9, q12
|
||||
vsub.f32 q2, q10, q15 @
|
||||
vadd.f32 q3, q11, q14 @
|
||||
vsub.f32 q4, q8, q13
|
||||
vst1.32 {q0,q1}, [r4, :128]!
|
||||
vsub.f32 q5, q9, q12
|
||||
vadd.f32 q6, q10, q15 @
|
||||
vst1.32 {q2,q3}, [r6, :128]!
|
||||
vsub.f32 q7, q11, q14 @
|
||||
vst1.32 {q4,q5}, [r8, :128]!
|
||||
vst1.32 {q6,q7}, [r10, :128]!
|
||||
bne neon_x8_loop
|
||||
|
||||
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
@ neon_static_x8_t_i -- entry and pointer setup for the eight-stream pass.
@ r0 = base data pointer, r1 = byte distance between consecutive streams,
@ r2 = lookup-table pointer. Setup is the same as neon_static_x8_i; the loop
@ that follows stores with vst2.32 (interleaving) where that routine uses
@ vst1.32.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _neon_static_x8_t_i
|
||||
_neon_static_x8_t_i:
|
||||
#else
|
||||
.globl neon_static_x8_t_i
|
||||
neon_static_x8_t_i:
|
||||
#endif
|
||||
@ Save r4-r11 and lr, then d8-d15 (q4-q7), all of which the loop overwrites.
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
||||
vstmdb sp!, {d8-d15}
|
||||
@ r11 becomes a negative loop counter (see the sub below).
mov r11, #0
|
||||
@ dataK = r0 + K*r1 for K = 0..7, held in r3..r10.
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
||||
@ r11 = -(r1 >> 5); the loop does "adds r11, r11, #1" and repeats until it
@ reaches zero, i.e. r1/32 iterations.
sub r11, r11, r1, lsr #5
|
||||
neon_x8_t_loop:
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vld1.32 {q14,q15}, [r6, :128]
|
||||
vld1.32 {q10,q11}, [r5, :128]
|
||||
adds r11, r11, #1
|
||||
vmul.f32 q12, q15, q2
|
||||
vmul.f32 q8, q14, q3
|
||||
vmul.f32 q13, q14, q2
|
||||
vmul.f32 q9, q10, q3
|
||||
vmul.f32 q1, q10, q2
|
||||
vmul.f32 q0, q11, q2
|
||||
vmul.f32 q14, q11, q3
|
||||
vmul.f32 q15, q15, q3
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vsub.f32 q10, q12, q8
|
||||
vadd.f32 q11, q0, q9
|
||||
vadd.f32 q8, q15, q13
|
||||
vld1.32 {q12,q13}, [r4, :128]
|
||||
vsub.f32 q9, q1, q14
|
||||
vsub.f32 q15, q11, q10
|
||||
vsub.f32 q14, q9, q8
|
||||
vsub.f32 q4, q12, q15 @
|
||||
vadd.f32 q6, q12, q15 @
|
||||
vadd.f32 q5, q13, q14 @
|
||||
vsub.f32 q7, q13, q14 @
|
||||
vld1.32 {q14,q15}, [r9, :128]
|
||||
vld1.32 {q12,q13}, [r7, :128]
|
||||
vmul.f32 q1, q14, q2
|
||||
vmul.f32 q0, q14, q3
|
||||
vst1.32 {q4,q5}, [r4, :128]
|
||||
vmul.f32 q14, q15, q3
|
||||
vmul.f32 q4, q15, q2
|
||||
vadd.f32 q15, q9, q8
|
||||
vst1.32 {q6,q7}, [r6, :128]
|
||||
vmul.f32 q8, q12, q3
|
||||
vmul.f32 q5, q13, q3
|
||||
vmul.f32 q12, q12, q2
|
||||
vmul.f32 q9, q13, q2
|
||||
vadd.f32 q14, q14, q1
|
||||
vsub.f32 q13, q4, q0
|
||||
vadd.f32 q0, q9, q8
|
||||
vld1.32 {q8,q9}, [r3, :128]
|
||||
vadd.f32 q1, q11, q10
|
||||
vsub.f32 q12, q12, q5
|
||||
vadd.f32 q11, q8, q15
|
||||
vsub.f32 q8, q8, q15
|
||||
vadd.f32 q2, q12, q14
|
||||
vsub.f32 q10, q0, q13
|
||||
vadd.f32 q15, q0, q13
|
||||
vadd.f32 q13, q9, q1
|
||||
vsub.f32 q9, q9, q1
|
||||
vsub.f32 q12, q12, q14
|
||||
vadd.f32 q0, q11, q2
|
||||
vadd.f32 q1, q13, q15
|
||||
vsub.f32 q4, q11, q2
|
||||
vsub.f32 q2, q8, q10 @
|
||||
vadd.f32 q3, q9, q12 @
|
||||
vst2.32 {q0,q1}, [r3, :128]!
|
||||
vsub.f32 q5, q13, q15
|
||||
vld1.32 {q14,q15}, [r10, :128]
|
||||
vsub.f32 q7, q9, q12 @
|
||||
vld1.32 {q12,q13}, [r8, :128]
|
||||
vst2.32 {q2,q3}, [r5, :128]!
|
||||
vld1.32 {q2,q3}, [r12, :128]!
|
||||
vadd.f32 q6, q8, q10 @
|
||||
vmul.f32 q8, q14, q2
|
||||
vst2.32 {q4,q5}, [r7, :128]!
|
||||
vmul.f32 q10, q15, q3
|
||||
vmul.f32 q9, q13, q3
|
||||
vmul.f32 q11, q12, q2
|
||||
vmul.f32 q14, q14, q3
|
||||
vst2.32 {q6,q7}, [r9, :128]!
|
||||
vmul.f32 q15, q15, q2
|
||||
vmul.f32 q12, q12, q3
|
||||
vmul.f32 q13, q13, q2
|
||||
vadd.f32 q10, q10, q8
|
||||
vsub.f32 q11, q11, q9
|
||||
vld1.32 {q8,q9}, [r4, :128]
|
||||
vsub.f32 q14, q15, q14
|
||||
vadd.f32 q15, q13, q12
|
||||
vadd.f32 q13, q11, q10
|
||||
vadd.f32 q12, q15, q14
|
||||
vsub.f32 q15, q15, q14
|
||||
vsub.f32 q14, q11, q10
|
||||
vld1.32 {q10,q11}, [r6, :128]
|
||||
vadd.f32 q0, q8, q13
|
||||
vadd.f32 q1, q9, q12
|
||||
vsub.f32 q2, q10, q15 @
|
||||
vadd.f32 q3, q11, q14 @
|
||||
vsub.f32 q4, q8, q13
|
||||
vst2.32 {q0,q1}, [r4, :128]!
|
||||
vsub.f32 q5, q9, q12
|
||||
vadd.f32 q6, q10, q15 @
|
||||
vst2.32 {q2,q3}, [r6, :128]!
|
||||
vsub.f32 q7, q11, q14 @
|
||||
vst2.32 {q4,q5}, [r8, :128]!
|
||||
vst2.32 {q6,q7}, [r10, :128]!
|
||||
bne neon_x8_t_loop
|
||||
|
||||
vldmia sp!, {d8-d15}
|
||||
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
||||
|
||||
|
||||
209
3rdparty/ffts/ffts-master/src/patterns.c
vendored
Normal file
209
3rdparty/ffts/ffts-master/src/patterns.c
vendored
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "patterns.h"
|
||||
|
||||
/*
 * Fill d[0..3] with the four addresses offset + (k << stride) for k taken in
 * the order 0, 2, 1, 3. Any address that comes out negative is wrapped by
 * adding N.
 */
void permute_addr(int N, int offset, int stride, int *d) {
	static const int order[4] = {0, 2, 1, 3};
	int k;

	for (k = 0; k < 4; ++k) {
		int addr = offset + (order[k] << stride);

		d[k] = (addr < 0) ? addr + N : addr;
	}
}
|
||||
|
||||
/*
 * Recursively emit groups of four input indices into the table at *is,
 * advancing *is by 4 for every group written.
 *
 *  - N == 4: the group is permute_addr(bigN, poffset, stride) doubled. It is
 *    written only when its first doubled index is a multiple of 2*VL.
 *  - N  > 4: recurse on the N/2 sub-problem, then either on the two N/4
 *    sub-problems (when N/4 >= 4) or emit one hard-coded group directly.
 *  - N  < 4: nothing is emitted.
 *
 * `offset` and `even` are only forwarded to the recursive calls; they do not
 * influence the values written.
 */
void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
	int k;

	if (N == 4) {
		int perm[4];

		permute_addr(bigN, poffset, stride, perm);
		if ((perm[0] * 2) % (VL * 2)) return;

		for (k = 0; k < 4; k++) (*is)[k] = perm[k] * 2;
		*is += 4;
		return;
	}

	if (N < 4) return;

	ffts_hardcodedleaf_is_rec(is, bigN, N / 2, poffset, offset, stride + 1, even, VL);

	if (N / 4 >= 4) {
		ffts_hardcodedleaf_is_rec(is, bigN, N / 4, poffset + (1 << stride), offset + (N / 2), stride + 2, 0, VL);
		ffts_hardcodedleaf_is_rec(is, bigN, N / 4, poffset - (1 << stride), offset + (3 * N / 4), stride + 2, 0, VL);
	} else {
		/* Doubled, wrapped first index decides whether this group is kept. */
		int first = poffset + (1 << stride);

		if (first < 0) first += bigN;
		first *= 2;
		if (first % (VL * 2)) return;

		(*is)[0] = poffset + (1 << stride);
		(*is)[1] = poffset + (1 << stride) + (1 << (stride + 2));
		(*is)[2] = poffset - (1 << stride);
		(*is)[3] = poffset - (1 << stride) + (1 << (stride + 2));

		/* Wrap negative indices into [0, bigN) and convert to doubled units. */
		for (k = 0; k < 4; k++) {
			if ((*is)[k] < 0) (*is)[k] += bigN;
			(*is)[k] *= 2;
		}
		*is += 4;
	}
}
|
||||
|
||||
/*
 * Allocate p->is (N/VL entries) and fill it with leaf input indices, and
 * record the leaf-group counts in p->i0 and p->i1.
 *
 * Three runs of ffts_hardcodedleaf_is_rec() populate the table:
 *   1. i in [0, i0)        full-size leaves,
 *   2. i in [i0, i0 + i1)  pairs of half-size leaves,
 *   3. i in [-i2, 0)       full-size leaves at negative (wrapping) offsets.
 *
 * If the allocation fails, p->is is left NULL and the table is not filled;
 * p->i0 and p->i1 are still set.
 */
void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
	int i, i0 = N/leafN/3+1, i1 = N/leafN/3, i2 = N/leafN/3;
	int stride = 0;
	int blocks;
	ptrdiff_t *is;

	/* stride = floor(log2(N/leafN)), computed in integers: the previous
	 * log(x)/log(2) form depends on floating-point rounding before the
	 * truncation to int and is undefined for N/leafN == 0. */
	for (blocks = N/leafN; blocks > 1; blocks >>= 1) stride++;

	if ((N/leafN) % 3 > 1) i1++;

	p->i0 = i0; p->i1 = i1;

	p->is = malloc(N/VL * sizeof(ptrdiff_t));
	if (!p->is) return;

	is = p->is;

	for (i = 0; i < i0; i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
	for (i = i0; i < i0+i1; i++) {
		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
	}
	for (i = 0-i2; i < 0; i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
}
|
||||
/**
 * Recursively record (input offset, output offset) pairs for every leaf.
 *
 * A node is a leaf when N == leafN on an "even" branch, or N <= leafN on any
 * other branch. For a leaf, slot 2*(ooffset/leafN) receives ioffset*2 and the
 * following slot receives ooffset. Larger nodes (N > 4) split into an N/2
 * child at the same offsets, an N/4 child at ioffset + 2^stride, and -- only
 * when N/4 >= leafN -- an N/4 child at ioffset - 2^stride.
 *
 * Input offsets may be negative; the caller is responsible for wrapping them.
 */
void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
	const int is_leaf = even ? (N == leafN) : (N <= leafN);

	if (is_leaf) {
		ptrdiff_t *slot = offsets + 2*(ooffset/leafN);

		slot[0] = ioffset*2;
		slot[1] = ooffset;
		return;
	}

	if (N <= 4) return;

	ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
	ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
	if (N/4 >= leafN) {
		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
	}
}
|
||||
|
||||
/*
 * qsort() comparator: orders records by their first ptrdiff_t field.
 *
 * Returns -1, 0 or 1. A three-way comparison is used instead of returning
 * the difference, because a ptrdiff_t difference can overflow and is
 * narrowed when converted to int, which can yield 0 or the wrong sign for
 * keys that are far apart.
 */
int compare_offsets(const void *a, const void *b) {
	const ptrdiff_t lhs = *(const ptrdiff_t *)a;
	const ptrdiff_t rhs = *(const ptrdiff_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
/*
 * Reverse the order of the low n bits of a (bit i moves to bit n-1-i).
 * Bits of a at position n and above are ignored. n must be in [0, 32].
 *
 * The masks are built from an unsigned 1 so that reaching bit 31 (n == 32)
 * does not shift into the sign bit of an int, which is undefined behaviour.
 */
uint32_t reverse_bits(uint32_t a, int n) {
	uint32_t x = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (a & ((uint32_t)1 << i)) x |= (uint32_t)1 << (n-i-1);
	}
	return x;
}
|
||||
|
||||
|
||||
/*
 * Build p->offsets: one entry per leaf (N/leafN entries), holding twice the
 * leaf's output offset, ordered by ascending (wrapped) input offset.
 *
 * A temporary table of (input offset, output offset) pairs is produced by
 * ffts_elaborate_offsets(), negative input offsets are wrapped by adding N,
 * and the pairs are sorted on the input offset with compare_offsets().
 *
 * If either allocation fails, p->offsets is left NULL.
 */
void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
	const int n_leaves = N/leafN;
	const int n_entries = 2*N/leafN;   /* two ptrdiff_t values per leaf */
	ptrdiff_t *offsets;
	int i;

	p->offsets = NULL;

	offsets = malloc(n_entries * sizeof(ptrdiff_t));
	if (!offsets) return;

	ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);

	/* Only the even slots (input offsets) can be negative. */
	for (i = 0; i < n_entries; i += 2) {
		if (offsets[i] < 0) offsets[i] = N + offsets[i];
	}

	qsort(offsets, n_leaves, 2 * sizeof(ptrdiff_t), compare_offsets);

	p->offsets = malloc(n_leaves * sizeof(ptrdiff_t));
	if (p->offsets) {
		for (i = 0; i < n_leaves; i++) {
			p->offsets[i] = offsets[i*2+1]*2;
		}
	}

	free(offsets);
}
|
||||
|
||||
/*
|
||||
int tree_count(int N, int leafN, int offset) {
|
||||
|
||||
if(N <= leafN) return 0;
|
||||
int count = 0;
|
||||
count += tree_count(N/4, leafN, offset);
|
||||
count += tree_count(N/8, leafN, offset + N/4);
|
||||
count += tree_count(N/8, leafN, offset + N/4 + N/8);
|
||||
count += tree_count(N/4, leafN, offset + N/2);
|
||||
count += tree_count(N/4, leafN, offset + 3*N/4);
|
||||
|
||||
return 1 + count;
|
||||
}
|
||||
|
||||
void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
|
||||
|
||||
if(N <= leafN) return;
|
||||
elaborate_tree(p, N/4, leafN, offset);
|
||||
elaborate_tree(p, N/8, leafN, offset + N/4);
|
||||
elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
|
||||
elaborate_tree(p, N/4, leafN, offset + N/2);
|
||||
elaborate_tree(p, N/4, leafN, offset + 3*N/4);
|
||||
|
||||
(*p)[0] = N;
|
||||
(*p)[1] = offset*2;
|
||||
|
||||
(*p)+=2;
|
||||
}
|
||||
|
||||
void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
|
||||
|
||||
int count = tree_count(N, leafN, 0) + 1;
|
||||
transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));
|
||||
|
||||
//printf("count = %d\n", count);
|
||||
|
||||
elaborate_tree(&ps, N, leafN, 0);
|
||||
#ifdef __ARM_NEON__
|
||||
ps -= 2;
|
||||
#endif
|
||||
ps[0] = 0;
|
||||
ps[1] = 0;
|
||||
//int i;
|
||||
//for(i=0;i<count;i++) {
|
||||
// fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
|
||||
// __builtin_ctzl(p->transforms[i*2]) - 5);
|
||||
//}
|
||||
|
||||
}
|
||||
*/
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
45
3rdparty/ffts/ffts-master/src/patterns.h
vendored
Normal file
45
3rdparty/ffts/ffts-master/src/patterns.h
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
/* Index/offset table builders used when constructing a plan. */
#ifndef __PATTERNS_H__
|
||||
#define __PATTERNS_H__
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
/* Allocates p->is (N/VL entries), fills it with leaf input indices, and
   sets p->i0 and p->i1. */
void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL);
|
||||
/* Allocates p->offsets (N/leafN entries) and fills it with doubled leaf
   output offsets, ordered by input offset. */
void ffts_init_offsets(ffts_plan_t *p, int N, int leafN);
|
||||
//void ffts_init_tree(ffts_plan_t *p, int N, int leafN);
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
878
3rdparty/ffts/ffts-master/src/sse.s
vendored
Normal file
878
3rdparty/ffts/ffts-master/src/sse.s
vendored
Normal file
@@ -0,0 +1,878 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
.globl _neon_x4
|
||||
.align 4
|
||||
_neon_x4:
|
||||
|
||||
.globl _neon_x8
|
||||
.align 4
|
||||
_neon_x8:
|
||||
|
||||
.globl _neon_x8_t
|
||||
.align 4
|
||||
_neon_x8_t:
|
||||
|
||||
|
||||
# leaf_ee_init: two-instruction setup for the leaf code that follows.
# It loads the constants pointer from offset 0xe0 of the structure addressed
# by %rdi into %r9 and clears %eax. There is no ret or jmp here, so
# execution continues straight into the next label in the file.
#ifdef __APPLE__
|
||||
.globl _leaf_ee_init
|
||||
_leaf_ee_init:
|
||||
#else
|
||||
.globl leaf_ee_init
|
||||
leaf_ee_init:
|
||||
#endif
|
||||
#lea L_sse_constants(%rip), %r9
|
||||
movq 0xe0(%rdi), %r9
|
||||
xorl %eax, %eax
|
||||
# eax is loop counter (init to 0)
|
||||
# rcx is loop max count
|
||||
# rsi is 'in' base pointer
|
||||
# rdx is 'out' base pointer
|
||||
# r8 is offsets pointer
|
||||
# r9 is constants pointer
|
||||
# scratch: rax r11 r12
|
||||
# .align 4, 0x90
|
||||
|
||||
# _leaf_ee + 9 needs 16 byte alignment
|
||||
#ifdef __APPLE__
|
||||
.globl _leaf_ee
|
||||
_leaf_ee:
|
||||
#else
|
||||
.globl leaf_ee
|
||||
leaf_ee:
|
||||
#endif
|
||||
movaps 32(%r9), %xmm0 #83.5
|
||||
movaps (%r9), %xmm8 #83.5
|
||||
LEAF_EE_1:
|
||||
LEAF_EE_const_0:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
|
||||
LEAF_EE_const_2:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
|
||||
movaps %xmm7, %xmm6 #83.5
|
||||
LEAF_EE_const_3:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
|
||||
movaps %xmm12, %xmm11 #83.5
|
||||
subps %xmm10, %xmm12 #83.5
|
||||
addps %xmm10, %xmm11 #83.5
|
||||
xorps %xmm8, %xmm12 #83.5
|
||||
LEAF_EE_const_1:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
|
||||
LEAF_EE_const_4:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
|
||||
addps %xmm9, %xmm6 #83.5
|
||||
subps %xmm9, %xmm7 #83.5
|
||||
LEAF_EE_const_5:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
|
||||
movaps %xmm10, %xmm9 #83.5
|
||||
LEAF_EE_const_6:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
|
||||
movaps %xmm6, %xmm5 #83.5
|
||||
LEAF_EE_const_7:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
|
||||
movaps %xmm3, %xmm15 #83.5
|
||||
shufps $177, %xmm12, %xmm12 #83.5
|
||||
movaps %xmm7, %xmm4 #83.5
|
||||
movslq (%r8, %rax, 4), %r11 #83.44
|
||||
subps %xmm13, %xmm10 #83.5
|
||||
subps %xmm14, %xmm3 #83.5
|
||||
addps %xmm11, %xmm5 #83.5
|
||||
subps %xmm11, %xmm6 #83.5
|
||||
subps %xmm12, %xmm4 #83.5
|
||||
addps %xmm12, %xmm7 #83.5
|
||||
addps %xmm13, %xmm9 #83.5
|
||||
addps %xmm14, %xmm15 #83.5
|
||||
movaps 16(%r9), %xmm12 #83.5
|
||||
movaps %xmm9, %xmm1 #83.5
|
||||
movaps 16(%r9), %xmm11 #83.5
|
||||
movaps %xmm5, %xmm2 #83.5
|
||||
mulps %xmm10, %xmm12 #83.5
|
||||
subps %xmm15, %xmm9 #83.5
|
||||
addps %xmm15, %xmm1 #83.5
|
||||
mulps %xmm3, %xmm11 #83.5
|
||||
addps %xmm1, %xmm2 #83.5
|
||||
subps %xmm1, %xmm5 #83.5
|
||||
shufps $177, %xmm10, %xmm10 #83.5
|
||||
xorps %xmm8, %xmm9 #83.5
|
||||
shufps $177, %xmm3, %xmm3 #83.5
|
||||
movaps %xmm6, %xmm1 #83.5
|
||||
mulps %xmm0, %xmm10 #83.5
|
||||
movaps %xmm4, %xmm13 #83.5
|
||||
mulps %xmm0, %xmm3 #83.5
|
||||
subps %xmm10, %xmm12 #83.5
|
||||
addps %xmm3, %xmm11 #83.5
|
||||
movaps %xmm12, %xmm3 #83.5
|
||||
movaps %xmm7, %xmm14 #83.5
|
||||
shufps $177, %xmm9, %xmm9 #83.5
|
||||
subps %xmm11, %xmm12 #83.5
|
||||
addps %xmm11, %xmm3 #83.5
|
||||
subps %xmm9, %xmm1 #83.5
|
||||
addps %xmm9, %xmm6 #83.5
|
||||
addps %xmm3, %xmm4 #83.5
|
||||
subps %xmm3, %xmm13 #83.5
|
||||
xorps %xmm8, %xmm12 #83.5
|
||||
movaps %xmm2, %xmm3 #83.5
|
||||
shufps $177, %xmm12, %xmm12 #83.5
|
||||
movaps %xmm6, %xmm9 #83.5
|
||||
movslq 8(%r8, %rax, 4), %r12 #83.59
|
||||
movlhps %xmm4, %xmm3 #83.5
|
||||
addq $4, %rax
|
||||
shufps $238, %xmm4, %xmm2 #83.5
|
||||
movaps %xmm1, %xmm4 #83.5
|
||||
#movntdq %xmm3, (%rdx,%r11,4) #83.5
|
||||
subps %xmm12, %xmm7 #83.5
|
||||
addps %xmm12, %xmm14 #83.5
|
||||
movlhps %xmm7, %xmm4 #83.5
|
||||
shufps $238, %xmm7, %xmm1 #83.5
|
||||
movaps %xmm5, %xmm7 #83.5
|
||||
movlhps %xmm13, %xmm7 #83.5
|
||||
movlhps %xmm14, %xmm9 #83.5
|
||||
shufps $238, %xmm13, %xmm5 #83.5
|
||||
shufps $238, %xmm14, %xmm6 #83.5
|
||||
movaps %xmm3, (%rdx,%r11,4) #83.5
|
||||
movaps %xmm4, 16(%rdx,%r11,4) #83.5
|
||||
movaps %xmm7, 32(%rdx,%r11,4) #83.5
|
||||
movaps %xmm9, 48(%rdx,%r11,4) #83.5
|
||||
movaps %xmm2, (%rdx,%r12,4) #83.5
|
||||
movaps %xmm1, 16(%rdx,%r12,4) #83.5
|
||||
movaps %xmm5, 32(%rdx,%r12,4) #83.5
|
||||
movaps %xmm6, 48(%rdx,%r12,4) #83.5
|
||||
cmpq %rcx, %rax
|
||||
jne LEAF_EE_1
|
||||
|
||||
|
||||
|
||||
# _leaf_oo + 4 needs to be 16 byte aligned
|
||||
# leaf_oo: loop that, on every pass, loads eight 16-byte vectors from
# (%rsi,%rax,4) plus a per-load displacement, combines them with add/sub,
# XOR and shuffle steps, and stores eight result vectors through two
# indices read from the table at %r8.
# Registers as used below:
#   %rsi = base of the loads, %rax = running index (advanced by 4 per pass),
#   %rcx = loop bound (the loop exits when %rax == %rcx),
#   %r8  = table of 32-bit indices, %rdx = base of the stores,
#   %r9  = address of a 16-byte constant that is XORed into the data,
#   %r11, %r12 = scratch (store indices), %xmm1-%xmm15 are overwritten.
# The 0xFECA displacements are placeholders; sse_leaf_oo_offsets records
# where each LEAF_OO_const_N displacement sits, presumably so the code
# generator can patch in the real value at run time -- TODO confirm.
# NOTE: there is no ret here; execution falls through into leaf_eo.
#ifdef __APPLE__
|
||||
.globl _leaf_oo
|
||||
_leaf_oo:
|
||||
#else
|
||||
.globl leaf_oo
|
||||
leaf_oo:
|
||||
#endif
|
||||
# XOR mask, loaded once outside the loop (presumably the sign mask that
# forms row 0 of sse_constants -- TODO confirm what %r9 points at)
movaps (%r9), %xmm5 #92.7
|
||||
LEAF_OO_1:
|
||||
LEAF_OO_const_0:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
|
||||
movaps %xmm4, %xmm6 #93.5
|
||||
LEAF_OO_const_1:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
|
||||
LEAF_OO_const_2:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
|
||||
addps %xmm7, %xmm6 #93.5
|
||||
subps %xmm7, %xmm4 #93.5
|
||||
LEAF_OO_const_3:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
|
||||
movaps %xmm10, %xmm9 #93.5
|
||||
LEAF_OO_const_4:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
|
||||
movaps %xmm6, %xmm3 #93.5
|
||||
LEAF_OO_const_5:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
|
||||
movaps %xmm1, %xmm2 #93.5
|
||||
LEAF_OO_const_6:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
|
||||
movaps %xmm4, %xmm15 #93.5
|
||||
LEAF_OO_const_7:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
|
||||
movaps %xmm14, %xmm13 #93.5
|
||||
# r11 = first store index: sign-extended 32-bit entry of the %r8 table
movslq (%r8, %rax, 4), %r11 #83.44
|
||||
subps %xmm8, %xmm10 #93.5
|
||||
addps %xmm8, %xmm9 #93.5
|
||||
addps %xmm11, %xmm2 #93.5
|
||||
subps %xmm12, %xmm14 #93.5
|
||||
subps %xmm11, %xmm1 #93.5
|
||||
addps %xmm12, %xmm13 #93.5
|
||||
addps %xmm9, %xmm3 #93.5
|
||||
subps %xmm9, %xmm6 #93.5
|
||||
xorps %xmm5, %xmm10 #93.5
|
||||
xorps %xmm5, %xmm14 #93.5
|
||||
# shuffle immediate 177 swaps the two floats inside each 64-bit half
shufps $177, %xmm10, %xmm10 #93.5
|
||||
movaps %xmm2, %xmm9 #93.5
|
||||
shufps $177, %xmm14, %xmm14 #93.5
|
||||
movaps %xmm6, %xmm7 #93.5
|
||||
# r12 = second store index: the table entry 8 bytes after the first one;
# both index reads use the old %rax, which is only advanced afterwards
movslq 8(%r8, %rax, 4), %r12 #83.59
|
||||
addq $4, %rax #92.18
|
||||
addps %xmm10, %xmm4 #93.5
|
||||
addps %xmm13, %xmm9 #93.5
|
||||
subps %xmm13, %xmm2 #93.5
|
||||
subps %xmm10, %xmm15 #93.5
|
||||
movaps %xmm1, %xmm13 #93.5
|
||||
movaps %xmm2, %xmm8 #93.5
|
||||
movlhps %xmm4, %xmm7 #93.5
|
||||
subps %xmm14, %xmm13 #93.5
|
||||
addps %xmm14, %xmm1 #93.5
|
||||
shufps $238, %xmm4, %xmm6 #93.5
|
||||
movaps %xmm3, %xmm14 #93.5
|
||||
movaps %xmm9, %xmm4 #93.5
|
||||
movlhps %xmm15, %xmm14 #93.5
|
||||
movlhps %xmm13, %xmm4 #93.5
|
||||
movlhps %xmm1, %xmm8 #93.5
|
||||
shufps $238, %xmm15, %xmm3 #93.5
|
||||
shufps $238, %xmm13, %xmm9 #93.5
|
||||
shufps $238, %xmm1, %xmm2 #93.5
|
||||
# four vectors (64 bytes) go to (%rdx,%r11,4), four more to (%rdx,%r12,4)
movaps %xmm14, (%rdx,%r11,4) #93.5
|
||||
movaps %xmm7, 16(%rdx,%r11,4) #93.5
|
||||
movaps %xmm4, 32(%rdx,%r11,4) #93.5
|
||||
movaps %xmm8, 48(%rdx,%r11,4) #93.5
|
||||
movaps %xmm3, (%rdx,%r12,4) #93.5
|
||||
movaps %xmm6, 16(%rdx,%r12,4) #93.5
|
||||
movaps %xmm9, 32(%rdx,%r12,4) #93.5
|
||||
movaps %xmm2, 48(%rdx,%r12,4) #93.5
|
||||
cmpq %rcx, %rax
|
||||
jne LEAF_OO_1 # Prob 95% #92.14
|
||||
|
||||
# leaf_eo: straight-line (non-looping) variant of the leaf step.
# Loads eight 16-byte vectors from (%rsi,%rax,4) plus placeholder
# displacements (labels LEAF_EO_const_0..7, listed in sse_leaf_eo_offsets),
# reads two 32-bit store indices from the %r8 table into %r11 and %r12,
# and writes four vectors to (%rdx,%r12,4) and four to (%rdx,%r11,4).
# Constants are read from (%r9), 0x30(%r9) and 0x40(%r9); presumably %r9
# points at sse_constants or sse_constants_inv -- TODO confirm.
# %rax is advanced by 4. There is no branch and no ret: execution falls
# through into leaf_oe.
#ifdef __APPLE__
|
||||
.globl _leaf_eo
|
||||
_leaf_eo:
|
||||
#else
|
||||
.globl leaf_eo
|
||||
leaf_eo:
|
||||
#endif
|
||||
LEAF_EO_const_0:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
|
||||
LEAF_EO_const_2:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
|
||||
movaps %xmm9, %xmm11 #88.5
|
||||
LEAF_EO_const_3:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
|
||||
movaps %xmm7, %xmm6 #88.5
|
||||
LEAF_EO_const_1:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
|
||||
subps %xmm5, %xmm7 #88.5
|
||||
addps %xmm4, %xmm11 #88.5
|
||||
subps %xmm4, %xmm9 #88.5
|
||||
addps %xmm5, %xmm6 #88.5
|
||||
# xmm3 = XOR mask from the first 16 bytes of the constant block at %r9
movaps (%r9), %xmm3 #88.5
|
||||
movaps %xmm11, %xmm10 #88.5
|
||||
xorps %xmm3, %xmm7 #88.5
|
||||
movaps %xmm9, %xmm8 #88.5
|
||||
shufps $177, %xmm7, %xmm7 #88.5
|
||||
addps %xmm6, %xmm10 #88.5
|
||||
subps %xmm6, %xmm11 #88.5
|
||||
subps %xmm7, %xmm8 #88.5
|
||||
addps %xmm7, %xmm9 #88.5
|
||||
# store indices: r12 from the table entry 8 bytes in, r11 from the first
movslq 8(%r8, %rax, 4), %r12 #83.59
|
||||
movaps %xmm10, %xmm2 #88.5
|
||||
movslq (%r8, %rax, 4), %r11 #83.44
|
||||
movaps %xmm11, %xmm1 #88.5
|
||||
shufps $238, %xmm8, %xmm10 #88.5
|
||||
shufps $238, %xmm9, %xmm11 #88.5
|
||||
movaps %xmm10, (%rdx,%r12,4) #88.5
|
||||
movaps %xmm11, 16(%rdx,%r12,4) #88.5
|
||||
LEAF_EO_const_4:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
|
||||
LEAF_EO_const_5:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
|
||||
movaps %xmm15, %xmm14 #88.5
|
||||
LEAF_EO_const_6:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
|
||||
addps %xmm12, %xmm14 #88.5
|
||||
subps %xmm12, %xmm15 #88.5
|
||||
LEAF_EO_const_7:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
|
||||
movaps %xmm4, %xmm5 #88.5
|
||||
movaps %xmm14, %xmm7 #88.5
|
||||
addps %xmm13, %xmm5 #88.5
|
||||
subps %xmm13, %xmm4 #88.5
|
||||
movlhps %xmm8, %xmm2 #88.5
|
||||
movaps %xmm5, %xmm8 #88.5
|
||||
movlhps %xmm15, %xmm7 #88.5
|
||||
xorps %xmm3, %xmm15 #88.5
|
||||
movaps %xmm5, %xmm6 #88.5
|
||||
subps %xmm14, %xmm5 #88.5
|
||||
addps %xmm14, %xmm6 #88.5
|
||||
movlhps %xmm9, %xmm1 #88.5
|
||||
movaps %xmm4, %xmm14 #88.5
|
||||
movlhps %xmm4, %xmm8 #88.5
|
||||
movaps %xmm1, %xmm12 #88.5
|
||||
shufps $177, %xmm15, %xmm15 #88.5
|
||||
# multiplier constants at byte offsets 0x30 and 0x40 of the %r9 block
movaps 0x30(%r9), %xmm11 #88.5
|
||||
# all eight loads and both index reads are done, so %rax can move on
addq $4, %rax #90.5
|
||||
subps %xmm15, %xmm14 #88.5
|
||||
mulps %xmm7, %xmm11 #88.5
|
||||
addps %xmm15, %xmm4 #88.5
|
||||
movaps 0x30(%r9), %xmm9 #88.5
|
||||
movaps 0x40(%r9), %xmm15 #88.5
|
||||
shufps $177, %xmm7, %xmm7 #88.5
|
||||
mulps %xmm8, %xmm9 #88.5
|
||||
mulps %xmm15, %xmm7 #88.5
|
||||
shufps $177, %xmm8, %xmm8 #88.5
|
||||
subps %xmm7, %xmm11 #88.5
|
||||
mulps %xmm15, %xmm8 #88.5
|
||||
movaps %xmm11, %xmm10 #88.5
|
||||
addps %xmm8, %xmm9 #88.5
|
||||
shufps $238, %xmm14, %xmm6 #88.5
|
||||
subps %xmm9, %xmm11 #88.5
|
||||
addps %xmm9, %xmm10 #88.5
|
||||
# last use of xmm3 as the mask; the next instruction reuses it as a temp
xorps %xmm3, %xmm11 #88.5
|
||||
movaps %xmm2, %xmm3 #88.5
|
||||
shufps $177, %xmm11, %xmm11 #88.5
|
||||
subps %xmm10, %xmm3 #88.5
|
||||
addps %xmm10, %xmm2 #88.5
|
||||
addps %xmm11, %xmm12 #88.5
|
||||
subps %xmm11, %xmm1 #88.5
|
||||
shufps $238, %xmm4, %xmm5 #88.5
|
||||
movaps %xmm5, 48(%rdx,%r12,4) #88.5
|
||||
movaps %xmm6, 32(%rdx,%r12,4) #88.5
|
||||
movaps %xmm2, (%rdx,%r11,4) #88.5
|
||||
movaps %xmm1, 16(%rdx,%r11,4) #88.5
|
||||
movaps %xmm3, 32(%rdx,%r11,4) #88.5
|
||||
movaps %xmm12, 48(%rdx,%r11,4) #88.5
|
||||
|
||||
|
||||
# leaf_oe: straight-line (non-looping) variant of the leaf step.
# Loads eight 16-byte vectors from (%rsi,%rax,4) plus placeholder
# displacements (labels LEAF_OE_const_0..7, listed in sse_leaf_oe_offsets),
# reads two 32-bit store indices from the %r8 table into %r11 and %r12,
# and writes four vectors to (%rdx,%r11,4) and four to (%rdx,%r12,4).
# Constants are read from (%r9), 0x30(%r9) and 0x40(%r9); presumably %r9
# points at sse_constants or sse_constants_inv -- TODO confirm.
# %rax is advanced by 4. There is no branch and no ret: execution runs on
# past the leaf_end label that follows.
#ifdef __APPLE__
|
||||
.globl _leaf_oe
|
||||
_leaf_oe:
|
||||
#else
|
||||
.globl leaf_oe
|
||||
leaf_oe:
|
||||
#endif
|
||||
# xmm0 = XOR mask from the first 16 bytes of the constant block at %r9
movaps (%r9), %xmm0 #59.5
|
||||
#movaps 0x20(%r9), %xmm1 #59.5
|
||||
LEAF_OE_const_2:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
|
||||
LEAF_OE_const_3:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
|
||||
movaps %xmm6, %xmm10 #70.5
|
||||
shufps $228, %xmm8, %xmm10 #70.5
|
||||
movaps %xmm10, %xmm9 #70.5
|
||||
shufps $228, %xmm6, %xmm8 #70.5
|
||||
LEAF_OE_const_0:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
|
||||
LEAF_OE_const_1:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
|
||||
movaps %xmm12, %xmm14 #70.5
|
||||
# r11 = first store index: sign-extended 32-bit entry of the %r8 table
movslq (%r8, %rax, 4), %r11 #83.44
|
||||
addps %xmm8, %xmm9 #70.5
|
||||
subps %xmm8, %xmm10 #70.5
|
||||
addps %xmm7, %xmm14 #70.5
|
||||
subps %xmm7, %xmm12 #70.5
|
||||
movaps %xmm9, %xmm4 #70.5
|
||||
movaps %xmm14, %xmm13 #70.5
|
||||
shufps $238, %xmm10, %xmm4 #70.5
|
||||
xorps %xmm0, %xmm10 #70.5
|
||||
shufps $177, %xmm10, %xmm10 #70.5
|
||||
movaps %xmm12, %xmm11 #70.5
|
||||
movaps %xmm14, %xmm5 #70.5
|
||||
addps %xmm9, %xmm13 #70.5
|
||||
subps %xmm10, %xmm11 #70.5
|
||||
subps %xmm9, %xmm14 #70.5
|
||||
shufps $238, %xmm12, %xmm5 #70.5
|
||||
addps %xmm10, %xmm12 #70.5
|
||||
# r12 = second store index: the table entry 8 bytes after the first one
movslq 8(%r8, %rax, 4), %r12 #83.59
|
||||
movlhps %xmm11, %xmm13 #70.5
|
||||
movaps %xmm13, (%rdx,%r11,4) #70.5
|
||||
# multiplier constants at byte offsets 0x30 and 0x40 of the %r9 block
movaps 0x30(%r9), %xmm13 #70.5
|
||||
movlhps %xmm12, %xmm14 #70.5
|
||||
movaps 0x40(%r9), %xmm12 #70.5
|
||||
mulps %xmm5, %xmm13 #70.5
|
||||
shufps $177, %xmm5, %xmm5 #70.5
|
||||
mulps %xmm12, %xmm5 #70.5
|
||||
movaps %xmm14, 16(%rdx,%r11,4) #70.5
|
||||
subps %xmm5, %xmm13 #70.5
|
||||
movaps 0x30(%r9), %xmm5 #70.5
|
||||
mulps %xmm4, %xmm5 #70.5
|
||||
shufps $177, %xmm4, %xmm4 #70.5
|
||||
mulps %xmm12, %xmm4 #70.5
|
||||
LEAF_OE_const_4:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
|
||||
addps %xmm4, %xmm5 #70.5
|
||||
LEAF_OE_const_6:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
|
||||
movaps %xmm9, %xmm3 #70.5
|
||||
LEAF_OE_const_7:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
|
||||
movaps %xmm7, %xmm6 #70.5
|
||||
LEAF_OE_const_5:
|
||||
movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
|
||||
movaps %xmm13, %xmm4 #70.5
|
||||
subps %xmm2, %xmm7 #70.5
|
||||
addps %xmm15, %xmm3 #70.5
|
||||
subps %xmm15, %xmm9 #70.5
|
||||
addps %xmm2, %xmm6 #70.5
|
||||
subps %xmm5, %xmm13 #70.5
|
||||
addps %xmm5, %xmm4 #70.5
|
||||
xorps %xmm0, %xmm7 #70.5
|
||||
# all eight loads and both index reads are done, so %rax can move on
addq $4, %rax #72.5
|
||||
movaps %xmm3, %xmm2 #70.5
|
||||
shufps $177, %xmm7, %xmm7 #70.5
|
||||
movaps %xmm9, %xmm8 #70.5
|
||||
xorps %xmm0, %xmm13 #70.5
|
||||
addps %xmm6, %xmm2 #70.5
|
||||
subps %xmm7, %xmm8 #70.5
|
||||
subps %xmm6, %xmm3 #70.5
|
||||
addps %xmm7, %xmm9 #70.5
|
||||
movaps %xmm2, %xmm10 #70.5
|
||||
movaps %xmm3, %xmm11 #70.5
|
||||
shufps $238, %xmm8, %xmm2 #70.5
|
||||
shufps $238, %xmm9, %xmm3 #70.5
|
||||
movaps %xmm2, %xmm14 #70.5
|
||||
shufps $177, %xmm13, %xmm13 #70.5
|
||||
subps %xmm4, %xmm14 #70.5
|
||||
addps %xmm4, %xmm2 #70.5
|
||||
movaps %xmm3, %xmm4 #70.5
|
||||
subps %xmm13, %xmm3 #70.5
|
||||
addps %xmm13, %xmm4 #70.5
|
||||
movlhps %xmm8, %xmm10 #70.5
|
||||
movlhps %xmm9, %xmm11 #70.5
|
||||
movaps %xmm10, 32(%rdx,%r11,4) #70.5
|
||||
movaps %xmm11, 48(%rdx,%r11,4) #70.5
|
||||
movaps %xmm2, (%rdx,%r12,4) #70.5
|
||||
movaps %xmm3, 16(%rdx,%r12,4) #70.5
|
||||
movaps %xmm14, 32(%rdx,%r12,4) #70.5
|
||||
movaps %xmm4, 48(%rdx,%r12,4) #70.5
|
||||
|
||||
|
||||
# leaf_end: exported label with no instructions of its own; it sits directly
# after the last store of leaf_oe and directly before x_init. Presumably it
# lets the code generator measure the size of the leaf templates above
# (for example leaf_end - leaf_oe) -- TODO confirm against the caller.
#ifdef __APPLE__
|
||||
.globl _leaf_end
|
||||
_leaf_end:
|
||||
#else
|
||||
.globl leaf_end
|
||||
leaf_end:
|
||||
#endif
|
||||
|
||||
# x_init: two-instruction prologue for the x4 / x8 routines.
#   %xmm3 <- 16 bytes at (%r9); x4 and x8_soft use %xmm3 as their XOR mask
#   %r8   <- 8-byte value at 0x20(%rdi); x4 reads its multiplier constants
#            through %r8
# NOTE(review): presumably %rdi is the transform plan and offset 0x20 holds
# its twiddle-table pointer -- confirm against the plan struct layout.
# There is no ret; execution falls through into x4.
#ifdef __APPLE__
|
||||
.globl _x_init
|
||||
_x_init:
|
||||
#else
|
||||
.globl x_init
|
||||
x_init:
|
||||
#endif
|
||||
# older RIP-relative form of the mask load, kept for reference
#movaps L_sse_constants(%rip), %xmm3 #34.3
|
||||
movaps (%r9), %xmm3 #34.3
|
||||
movq 0x20(%rdi),%r8
|
||||
# x4: in-place pass over the 128 bytes (eight 16-byte vectors) at (%rdx).
# First half:  the vectors at offsets 64 and 96 are multiplied by the
#              constants at (%r8) and 16(%r8), then combined by add/sub with
#              the vectors at offsets 0 and 32; results go to 0/32/64/96.
# Second half: same pattern for offsets 80 and 112 with the constants at
#              32(%r8) and 48(%r8), combined with offsets 16 and 48; results
#              go to 16/48/80/112.
# %xmm3 is used as an XOR mask but is not loaded here; x_init, which falls
# through into this label, loads it from (%r9).
# Overwrites every xmm register except %xmm3. Ends with ret.
#ifdef __APPLE__
|
||||
.globl _x4
|
||||
_x4:
|
||||
#else
|
||||
.globl x4
|
||||
x4:
|
||||
#endif
|
||||
movaps 64(%rdx), %xmm0 #34.3
|
||||
movaps 96(%rdx), %xmm1 #34.3
|
||||
movaps (%rdx), %xmm7 #34.3
|
||||
movaps (%r8), %xmm4 #const
|
||||
movaps %xmm7, %xmm9 #34.3
|
||||
movaps %xmm4, %xmm6 #34.3
|
||||
movaps 16(%r8), %xmm2 #const
|
||||
mulps %xmm0, %xmm6 #34.3
|
||||
mulps %xmm1, %xmm4 #34.3
|
||||
# shuffle immediate 177 swaps the two floats inside each 64-bit half
shufps $177, %xmm0, %xmm0 #34.3
|
||||
shufps $177, %xmm1, %xmm1 #34.3
|
||||
mulps %xmm2, %xmm0 #34.3
|
||||
mulps %xmm1, %xmm2 #34.3
|
||||
subps %xmm0, %xmm6 #34.3
|
||||
addps %xmm2, %xmm4 #34.3
|
||||
movaps %xmm6, %xmm5 #34.3
|
||||
subps %xmm4, %xmm6 #34.3
|
||||
addps %xmm4, %xmm5 #34.3
|
||||
movaps 32(%rdx), %xmm8 #34.3
|
||||
xorps %xmm3, %xmm6 #34.3
|
||||
shufps $177, %xmm6, %xmm6 #34.3
|
||||
movaps %xmm8, %xmm10 #34.3
|
||||
# early load for the second half; offset 112 is not written by the
# first-half stores below
movaps 112(%rdx), %xmm12 #34.3
|
||||
subps %xmm5, %xmm9 #34.3
|
||||
addps %xmm5, %xmm7 #34.3
|
||||
addps %xmm6, %xmm10 #34.3
|
||||
subps %xmm6, %xmm8 #34.3
|
||||
movaps %xmm7, (%rdx) #34.3
|
||||
movaps %xmm8, 32(%rdx) #34.3
|
||||
movaps %xmm9, 64(%rdx) #34.3
|
||||
movaps %xmm10, 96(%rdx) #34.3
|
||||
movaps 32(%r8), %xmm14 #const #34.3
|
||||
movaps 80(%rdx), %xmm11 #34.3
|
||||
movaps %xmm14, %xmm0 #34.3
|
||||
movaps 48(%r8), %xmm13 #const #34.3
|
||||
mulps %xmm11, %xmm0 #34.3
|
||||
mulps %xmm12, %xmm14 #34.3
|
||||
shufps $177, %xmm11, %xmm11 #34.3
|
||||
shufps $177, %xmm12, %xmm12 #34.3
|
||||
mulps %xmm13, %xmm11 #34.3
|
||||
mulps %xmm12, %xmm13 #34.3
|
||||
subps %xmm11, %xmm0 #34.3
|
||||
addps %xmm13, %xmm14 #34.3
|
||||
movaps %xmm0, %xmm15 #34.3
|
||||
subps %xmm14, %xmm0 #34.3
|
||||
addps %xmm14, %xmm15 #34.3
|
||||
xorps %xmm3, %xmm0 #34.3
|
||||
movaps 16(%rdx), %xmm1 #34.3
|
||||
movaps 48(%rdx), %xmm2 #34.3
|
||||
movaps %xmm1, %xmm4 #34.3
|
||||
shufps $177, %xmm0, %xmm0 #34.3
|
||||
movaps %xmm2, %xmm5 #34.3
|
||||
addps %xmm15, %xmm1 #34.3
|
||||
subps %xmm0, %xmm2 #34.3
|
||||
subps %xmm15, %xmm4 #34.3
|
||||
addps %xmm0, %xmm5 #34.3
|
||||
movaps %xmm1, 16(%rdx) #34.3
|
||||
movaps %xmm2, 48(%rdx) #34.3
|
||||
movaps %xmm4, 80(%rdx) #34.3
|
||||
movaps %xmm5, 112(%rdx) #34.3
|
||||
ret
|
||||
|
||||
# _x8_soft + 5 needs to be 16 byte aligned
|
||||
# x8_soft: in-place loop over eight equally spaced data streams.
# Setup: %rax = 0, %rbx = %rdx, and %r9..%r15 are each %rcx*4 bytes beyond
# the previous pointer, giving eight stream pointers %rbx, %r9, ..., %r15.
# %rsi = %r8 walks a constant table, consuming 96 bytes per pass.
# Each pass of X8_soft_loop loads one 16-byte vector from every stream at
# index %rax, multiplies six of them by the constants at (%rsi), combines
# them with add/sub/XOR/shuffle steps and stores eight results back to the
# same addresses. %rax advances by 4; the loop exits when %rax == %rcx.
# %xmm3 is used as the XOR mask (lines tagged const) and is not loaded here;
# x_init loads it from (%r9) -- presumably callers arrange that first.
# NOTE(review): %rbx and %r12-%r15 are overwritten without being saved,
# although they are callee-saved in the System V AMD64 ABI; confirm that
# the generated caller preserves them.
# Ends with ret.
#ifdef __APPLE__
|
||||
.globl _x8_soft
|
||||
_x8_soft:
|
||||
#else
|
||||
.globl x8_soft
|
||||
x8_soft:
|
||||
#endif
|
||||
xorl %eax, %eax
|
||||
movq %rdx, %rbx
|
||||
movq %r8, %rsi
|
||||
leaq (%rdx,%rcx,4), %r9
|
||||
leaq (%r9,%rcx,4), %r10
|
||||
leaq (%r10,%rcx,4), %r11
|
||||
leaq (%r11,%rcx,4), %r12
|
||||
leaq (%r12,%rcx,4), %r13
|
||||
leaq (%r13,%rcx,4), %r14
|
||||
leaq (%r14,%rcx,4), %r15
|
||||
X8_soft_loop:
|
||||
movaps (%rsi), %xmm9
|
||||
movaps (%r10,%rax,4), %xmm6
|
||||
movaps %xmm9, %xmm11
|
||||
movaps (%r11,%rax,4), %xmm7
|
||||
movaps 16(%rsi), %xmm8
|
||||
mulps %xmm6, %xmm11
|
||||
mulps %xmm7, %xmm9
|
||||
shufps $177, %xmm6, %xmm6
|
||||
mulps %xmm8, %xmm6
|
||||
shufps $177, %xmm7, %xmm7
|
||||
subps %xmm6, %xmm11
|
||||
mulps %xmm7, %xmm8
|
||||
movaps %xmm11, %xmm10
|
||||
addps %xmm8, %xmm9
|
||||
movaps 32(%rsi), %xmm15
|
||||
addps %xmm9, %xmm10
|
||||
subps %xmm9, %xmm11
|
||||
movaps (%rbx,%rax,4), %xmm5
|
||||
movaps %xmm15, %xmm6
|
||||
movaps (%r12,%rax,4), %xmm12
|
||||
movaps %xmm5, %xmm2
|
||||
movaps (%r14,%rax,4), %xmm13
|
||||
xorps %xmm3, %xmm11 #const
|
||||
movaps 48(%rsi), %xmm14
|
||||
subps %xmm10, %xmm2
|
||||
mulps %xmm12, %xmm6
|
||||
addps %xmm10, %xmm5
|
||||
mulps %xmm13, %xmm15
|
||||
movaps 64(%rsi), %xmm10
|
||||
movaps %xmm5, %xmm0
|
||||
shufps $177, %xmm12, %xmm12
|
||||
shufps $177, %xmm13, %xmm13
|
||||
mulps %xmm14, %xmm12
|
||||
mulps %xmm13, %xmm14
|
||||
subps %xmm12, %xmm6
|
||||
addps %xmm14, %xmm15
|
||||
movaps (%r13,%rax,4), %xmm7
|
||||
movaps %xmm10, %xmm13
|
||||
movaps (%r15,%rax,4), %xmm8
|
||||
movaps %xmm6, %xmm12
|
||||
movaps 80(%rsi), %xmm9
|
||||
# six 16-byte constants (offsets 0..80) have been read; step to the next set
addq $96, %rsi
|
||||
mulps %xmm7, %xmm13
|
||||
subps %xmm15, %xmm6
|
||||
addps %xmm15, %xmm12
|
||||
mulps %xmm8, %xmm10
|
||||
subps %xmm12, %xmm0
|
||||
addps %xmm12, %xmm5
|
||||
shufps $177, %xmm7, %xmm7
|
||||
xorps %xmm3, %xmm6 #const
|
||||
shufps $177, %xmm8, %xmm8
|
||||
movaps %xmm2, %xmm12
|
||||
mulps %xmm9, %xmm7
|
||||
mulps %xmm8, %xmm9
|
||||
subps %xmm7, %xmm13
|
||||
addps %xmm9, %xmm10
|
||||
movaps (%r9,%rax,4), %xmm4
|
||||
shufps $177, %xmm11, %xmm11
|
||||
movaps %xmm4, %xmm1
|
||||
shufps $177, %xmm6, %xmm6
|
||||
addps %xmm11, %xmm1
|
||||
subps %xmm11, %xmm4
|
||||
addps %xmm6, %xmm12
|
||||
subps %xmm6, %xmm2
|
||||
movaps %xmm13, %xmm11
|
||||
movaps %xmm4, %xmm14
|
||||
movaps %xmm1, %xmm6
|
||||
subps %xmm10, %xmm13
|
||||
addps %xmm10, %xmm11
|
||||
xorps %xmm3, %xmm13 #const
|
||||
addps %xmm11, %xmm4
|
||||
subps %xmm11, %xmm14
|
||||
shufps $177, %xmm13, %xmm13
|
||||
# write the eight results back over the vectors loaded above
movaps %xmm5, (%rbx,%rax,4)
|
||||
movaps %xmm4, (%r9,%rax,4)
|
||||
movaps %xmm2, (%r10,%rax,4)
|
||||
subps %xmm13, %xmm1
|
||||
addps %xmm13, %xmm6
|
||||
movaps %xmm1, (%r11,%rax,4)
|
||||
movaps %xmm0, (%r12,%rax,4)
|
||||
movaps %xmm14, (%r13,%rax,4)
|
||||
movaps %xmm12, (%r14,%rax,4)
|
||||
movaps %xmm6, (%r15,%rax,4)
|
||||
addq $4, %rax
|
||||
cmpq %rcx, %rax
|
||||
jne X8_soft_loop
|
||||
ret
|
||||
|
||||
# x8_hard: same arithmetic as x8_soft, but every data access is written as
# a placeholder 0xFECA displacement off (%rdx,%rax,4) instead of going
# through a separate pointer register.
#   X8_const_0..7  label the eight placeholder loads,
#   X8_const1_0..7 label the eight placeholder stores.
# Presumably the code generator patches these displacements when it copies
# the template; no offset table for the X8 labels appears alongside the
# leaf tables that follow -- TODO confirm how they are located.
# Registers as used below:
#   %r9  = address of the 16-byte XOR mask (loaded once into %xmm5),
#   %r8  = constant table, advanced by 96 bytes per pass,
#   %rdx = data base, %rax = running index (advanced by 4 per pass),
#   %rcx = loop bound (the loop exits when %rax == %rcx).
# NOTE: there is no ret after the loop; the offset tables follow directly.
#ifdef __APPLE__
|
||||
.globl _x8_hard
|
||||
_x8_hard:
|
||||
#else
|
||||
.globl x8_hard
|
||||
x8_hard:
|
||||
#endif
|
||||
movaps (%r9), %xmm5
|
||||
X8_loop:
|
||||
movaps (%r8), %xmm9
|
||||
X8_const_2:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm6
|
||||
movaps %xmm9, %xmm11
|
||||
X8_const_3:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm7
|
||||
movaps 16(%r8), %xmm8
|
||||
mulps %xmm6, %xmm11
|
||||
mulps %xmm7, %xmm9
|
||||
shufps $177, %xmm6, %xmm6
|
||||
mulps %xmm8, %xmm6
|
||||
shufps $177, %xmm7, %xmm7
|
||||
subps %xmm6, %xmm11
|
||||
mulps %xmm7, %xmm8
|
||||
movaps %xmm11, %xmm10
|
||||
addps %xmm8, %xmm9
|
||||
movaps 32(%r8), %xmm15
|
||||
addps %xmm9, %xmm10
|
||||
subps %xmm9, %xmm11
|
||||
X8_const_0:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm3
|
||||
movaps %xmm15, %xmm6
|
||||
X8_const_4:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm12
|
||||
movaps %xmm3, %xmm2
|
||||
X8_const_6:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm13
|
||||
xorps %xmm5, %xmm11
|
||||
movaps 48(%r8), %xmm14
|
||||
subps %xmm10, %xmm2
|
||||
mulps %xmm12, %xmm6
|
||||
addps %xmm10, %xmm3
|
||||
mulps %xmm13, %xmm15
|
||||
movaps 64(%r8), %xmm10
|
||||
movaps %xmm3, %xmm0
|
||||
shufps $177, %xmm12, %xmm12
|
||||
shufps $177, %xmm13, %xmm13
|
||||
mulps %xmm14, %xmm12
|
||||
mulps %xmm13, %xmm14
|
||||
subps %xmm12, %xmm6
|
||||
addps %xmm14, %xmm15
|
||||
X8_const_5:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm7
|
||||
movaps %xmm10, %xmm13
|
||||
X8_const_7:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm8
|
||||
movaps %xmm6, %xmm12
|
||||
movaps 80(%r8), %xmm9
|
||||
# six 16-byte constants (offsets 0..80) have been read; step to the next set
addq $96, %r8
|
||||
mulps %xmm7, %xmm13
|
||||
subps %xmm15, %xmm6
|
||||
addps %xmm15, %xmm12
|
||||
mulps %xmm8, %xmm10
|
||||
subps %xmm12, %xmm0
|
||||
addps %xmm12, %xmm3
|
||||
shufps $177, %xmm7, %xmm7
|
||||
xorps %xmm5, %xmm6
|
||||
shufps $177, %xmm8, %xmm8
|
||||
movaps %xmm2, %xmm12
|
||||
mulps %xmm9, %xmm7
|
||||
mulps %xmm8, %xmm9
|
||||
subps %xmm7, %xmm13
|
||||
addps %xmm9, %xmm10
|
||||
X8_const_1:
|
||||
movaps 0xFECA(%rdx,%rax,4), %xmm4
|
||||
shufps $177, %xmm11, %xmm11
|
||||
movaps %xmm4, %xmm1
|
||||
shufps $177, %xmm6, %xmm6
|
||||
addps %xmm11, %xmm1
|
||||
subps %xmm11, %xmm4
|
||||
addps %xmm6, %xmm12
|
||||
subps %xmm6, %xmm2
|
||||
movaps %xmm13, %xmm11
|
||||
movaps %xmm4, %xmm14
|
||||
movaps %xmm1, %xmm6
|
||||
subps %xmm10, %xmm13
|
||||
addps %xmm10, %xmm11
|
||||
xorps %xmm5, %xmm13
|
||||
addps %xmm11, %xmm4
|
||||
subps %xmm11, %xmm14
|
||||
shufps $177, %xmm13, %xmm13
|
||||
X8_const1_0:
|
||||
movaps %xmm3, 0xFECA(%rdx,%rax,4)
|
||||
X8_const1_1:
|
||||
movaps %xmm4, 0xFECA(%rdx,%rax,4)
|
||||
X8_const1_2:
|
||||
movaps %xmm2, 0xFECA(%rdx,%rax,4)
|
||||
subps %xmm13, %xmm1
|
||||
addps %xmm13, %xmm6
|
||||
X8_const1_3:
|
||||
movaps %xmm1, 0xFECA(%rdx,%rax,4)
|
||||
X8_const1_4:
|
||||
movaps %xmm0, 0xFECA(%rdx,%rax,4)
|
||||
X8_const1_5:
|
||||
movaps %xmm14, 0xFECA(%rdx,%rax,4)
|
||||
X8_const1_6:
|
||||
movaps %xmm12, 0xFECA(%rdx,%rax,4)
|
||||
X8_const1_7:
|
||||
movaps %xmm6, 0xFECA(%rdx,%rax,4)
|
||||
addq $4, %rax
|
||||
cmpq %rcx, %rax
|
||||
jne X8_loop
|
||||
|
||||
# Offset tables for the four leaf templates (ee, oo, eo, oe).
# Each table holds eight 32-bit entries, one per LEAF_xx_const_N label:
#     (address of the labelled movaps) - (template entry label) + 4 or 5
# The labelled instruction is always a movaps load with a 0xFECA
# placeholder displacement. That value does not fit in a signed byte, so
# the assembler has to emit a 32-bit displacement field. The added 4 or 5
# is the number of bytes in front of that field: 4 when the destination is
# xmm0-xmm7, 5 when it is xmm8-xmm15 (one extra REX prefix byte). Every
# entry below matches the destination register of its load.
# Each entry is therefore the byte offset, from the template entry point,
# of the displacement to overwrite -- presumably patched by the code
# generator after it copies the template; TODO confirm against codegen.
# NOTE(review): changing the destination register of any labelled load
# changes its encoding length, so the matching entry must be updated too.
# The Apple and non-Apple halves are identical apart from the underscore
# prefix on the exported names.
#ifdef __APPLE__
|
||||
.globl _sse_leaf_ee_offsets
|
||||
.globl _sse_leaf_oo_offsets
|
||||
.globl _sse_leaf_eo_offsets
|
||||
.globl _sse_leaf_oe_offsets
|
||||
.align 4
|
||||
_sse_leaf_ee_offsets:
|
||||
.long LEAF_EE_const_0-_leaf_ee+0x4
|
||||
.long LEAF_EE_const_1-_leaf_ee+0x5
|
||||
.long LEAF_EE_const_2-_leaf_ee+0x5
|
||||
.long LEAF_EE_const_3-_leaf_ee+0x5
|
||||
.long LEAF_EE_const_4-_leaf_ee+0x5
|
||||
.long LEAF_EE_const_5-_leaf_ee+0x5
|
||||
.long LEAF_EE_const_6-_leaf_ee+0x4
|
||||
.long LEAF_EE_const_7-_leaf_ee+0x5
|
||||
_sse_leaf_oo_offsets:
|
||||
.long LEAF_OO_const_0-_leaf_oo+0x4
|
||||
.long LEAF_OO_const_1-_leaf_oo+0x4
|
||||
.long LEAF_OO_const_2-_leaf_oo+0x5
|
||||
.long LEAF_OO_const_3-_leaf_oo+0x5
|
||||
.long LEAF_OO_const_4-_leaf_oo+0x4
|
||||
.long LEAF_OO_const_5-_leaf_oo+0x5
|
||||
.long LEAF_OO_const_6-_leaf_oo+0x5
|
||||
.long LEAF_OO_const_7-_leaf_oo+0x5
|
||||
_sse_leaf_eo_offsets:
|
||||
.long LEAF_EO_const_0-_leaf_eo+0x5
|
||||
.long LEAF_EO_const_1-_leaf_eo+0x4
|
||||
.long LEAF_EO_const_2-_leaf_eo+0x4
|
||||
.long LEAF_EO_const_3-_leaf_eo+0x4
|
||||
.long LEAF_EO_const_4-_leaf_eo+0x5
|
||||
.long LEAF_EO_const_5-_leaf_eo+0x5
|
||||
.long LEAF_EO_const_6-_leaf_eo+0x4
|
||||
.long LEAF_EO_const_7-_leaf_eo+0x5
|
||||
_sse_leaf_oe_offsets:
|
||||
.long LEAF_OE_const_0-_leaf_oe+0x5
|
||||
.long LEAF_OE_const_1-_leaf_oe+0x4
|
||||
.long LEAF_OE_const_2-_leaf_oe+0x4
|
||||
.long LEAF_OE_const_3-_leaf_oe+0x5
|
||||
.long LEAF_OE_const_4-_leaf_oe+0x5
|
||||
.long LEAF_OE_const_5-_leaf_oe+0x5
|
||||
.long LEAF_OE_const_6-_leaf_oe+0x4
|
||||
.long LEAF_OE_const_7-_leaf_oe+0x4
|
||||
#else
|
||||
.globl sse_leaf_ee_offsets
|
||||
.globl sse_leaf_oo_offsets
|
||||
.globl sse_leaf_eo_offsets
|
||||
.globl sse_leaf_oe_offsets
|
||||
.align 4
|
||||
sse_leaf_ee_offsets:
|
||||
.long LEAF_EE_const_0-leaf_ee+0x4
|
||||
.long LEAF_EE_const_1-leaf_ee+0x5
|
||||
.long LEAF_EE_const_2-leaf_ee+0x5
|
||||
.long LEAF_EE_const_3-leaf_ee+0x5
|
||||
.long LEAF_EE_const_4-leaf_ee+0x5
|
||||
.long LEAF_EE_const_5-leaf_ee+0x5
|
||||
.long LEAF_EE_const_6-leaf_ee+0x4
|
||||
.long LEAF_EE_const_7-leaf_ee+0x5
|
||||
sse_leaf_oo_offsets:
|
||||
.long LEAF_OO_const_0-leaf_oo+0x4
|
||||
.long LEAF_OO_const_1-leaf_oo+0x4
|
||||
.long LEAF_OO_const_2-leaf_oo+0x5
|
||||
.long LEAF_OO_const_3-leaf_oo+0x5
|
||||
.long LEAF_OO_const_4-leaf_oo+0x4
|
||||
.long LEAF_OO_const_5-leaf_oo+0x5
|
||||
.long LEAF_OO_const_6-leaf_oo+0x5
|
||||
.long LEAF_OO_const_7-leaf_oo+0x5
|
||||
sse_leaf_eo_offsets:
|
||||
.long LEAF_EO_const_0-leaf_eo+0x5
|
||||
.long LEAF_EO_const_1-leaf_eo+0x4
|
||||
.long LEAF_EO_const_2-leaf_eo+0x4
|
||||
.long LEAF_EO_const_3-leaf_eo+0x4
|
||||
.long LEAF_EO_const_4-leaf_eo+0x5
|
||||
.long LEAF_EO_const_5-leaf_eo+0x5
|
||||
.long LEAF_EO_const_6-leaf_eo+0x4
|
||||
.long LEAF_EO_const_7-leaf_eo+0x5
|
||||
sse_leaf_oe_offsets:
|
||||
.long LEAF_OE_const_0-leaf_oe+0x5
|
||||
.long LEAF_OE_const_1-leaf_oe+0x4
|
||||
.long LEAF_OE_const_2-leaf_oe+0x4
|
||||
.long LEAF_OE_const_3-leaf_oe+0x5
|
||||
.long LEAF_OE_const_4-leaf_oe+0x5
|
||||
.long LEAF_OE_const_5-leaf_oe+0x5
|
||||
.long LEAF_OE_const_6-leaf_oe+0x4
|
||||
.long LEAF_OE_const_7-leaf_oe+0x4
|
||||
#endif
|
||||
|
||||
# Constant tables in the data section, aligned to 16 bytes (.p2align 4) so
# that each 16-byte row can be read with movaps.
# Both tables have five rows of four 32-bit words, at byte offsets
# 0x00, 0x10, 0x20, 0x30 and 0x40. Read as IEEE-754 single precision:
#   0x3f800000 = 1.0, 0x3f3504f3 = about 0.70710677 (the square root of
#   one half), 0xbf3504f3 = the same value negated, 0x80000000 = a word
#   with only the sign bit set.
# The routines in this file read rows 0x00, 0x10, 0x30 and 0x40 through
# %r9; presumably %r9 points at sse_constants for one transform direction
# and at sse_constants_inv for the other -- TODO confirm against the caller.
#ifdef __APPLE__
|
||||
.data
|
||||
#else
|
||||
.section .data
|
||||
#endif
|
||||
.p2align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _sse_constants
|
||||
_sse_constants:
|
||||
#else
|
||||
.globl sse_constants
|
||||
sse_constants:
|
||||
#endif
|
||||
# row 0x00: sign-bit mask for lanes 1 and 3 (XORing negates those lanes)
.long 0x00000000,0x80000000,0x00000000,0x80000000
|
||||
# row 0x10: 0.7071 in all four lanes
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
|
||||
# row 0x20: -0.7071, +0.7071, -0.7071, +0.7071
.long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
|
||||
# row 0x30: 1.0, 1.0, 0.7071, 0.7071
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
|
||||
# row 0x40: 0.0, 0.0, -0.7071, +0.7071
.long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
|
||||
# sse_constants_inv: same layout as sse_constants; rows 0x00, 0x20 and 0x40
# have their signs moved to the opposite lanes, rows 0x10 and 0x30 are equal.
#ifdef __APPLE__
|
||||
.globl _sse_constants_inv
|
||||
_sse_constants_inv:
|
||||
#else
|
||||
.globl sse_constants_inv
|
||||
sse_constants_inv:
|
||||
#endif
|
||||
# row 0x00: sign-bit mask for lanes 0 and 2
.long 0x80000000,0x00000000,0x80000000,0x00000000
|
||||
# row 0x10: 0.7071 in all four lanes
.long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
|
||||
# row 0x20: +0.7071, -0.7071, +0.7071, -0.7071
.long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
|
||||
# row 0x30: 1.0, 1.0, 0.7071, 0.7071
.long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
|
||||
# row 0x40: 0.0, 0.0, +0.7071, -0.7071
.long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
|
||||
50
3rdparty/ffts/ffts-master/src/types.h
vendored
Normal file
50
3rdparty/ffts/ffts-master/src/types.h
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __TYPES_H__
#define __TYPES_H__

/*
 * __INLINE: storage class plus forced-inlining hint for small helpers that
 * are defined in headers.
 *
 * The GCC/Clang spelling uses __attribute__((always_inline)), which MSVC
 * does not accept, so MSVC gets its own __forceinline spelling. Any other
 * compiler falls back to plain "static inline", which keeps the same
 * linkage and merely drops the forcing.
 */
#if defined(_MSC_VER)
#define __INLINE static __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define __INLINE static inline __attribute__((always_inline))
#else
#define __INLINE static inline
#endif

/*
 * cdata_t: one single-precision complex value.
 *
 * The C99 complex type is used only when the `complex` macro is already
 * defined at this point, i.e. when <complex.h> was included before this
 * header. Otherwise the type is a plain array of two floats, which has the
 * same size as `complex float`.
 */
#if defined(complex)
typedef complex float cdata_t;
#else
typedef float cdata_t[2];
#endif

/* data_t: one single-precision real value. */
typedef float data_t;

#endif
|
||||
|
||||
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
46
3rdparty/ffts/ffts-master/src/vfp.h
vendored
Normal file
46
3rdparty/ffts/ffts-master/src/vfp.h
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, 2013 The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __VFP_H__
|
||||
#define __VFP_H__
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
/*
 * Entry labels of the hand-written ARM VFP routines in vfp.s.
 *
 * vfp_e and vfp_o are exported there with .globl and take their inputs in
 * fixed registers (see the register notes at the top of each routine), not
 * through C arguments, which is why the parameter lists are left empty.
 * NOTE(review): vfp_x4, vfp_x8 and vfp_end are presumably the remaining
 * labels of the same file; presumably all five are used as code addresses
 * by the code generator, not called directly -- confirm against the caller.
 */
void vfp_e();
|
||||
void vfp_o();
|
||||
void vfp_x4();
|
||||
void vfp_x8();
|
||||
void vfp_end();
|
||||
|
||||
#endif
|
||||
// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
|
||||
473
3rdparty/ffts/ffts-master/src/vfp.s
vendored
Normal file
473
3rdparty/ffts/ffts-master/src/vfp.s
vendored
Normal file
@@ -0,0 +1,473 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2012, 2013 Anthony M. Blake <amb@anthonix.com>
|
||||
Copyright (c) 2012, 2013 The University of Waikato
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
@ vfp_e: loop that, on every pass, reads one pair of floats through each of
@ the eight data pointers r3-r10, combines them with add/sub and multiplies
@ by two constants, and writes 16 floats (64 bytes) to an output address
@ computed from the next entry of the offsets table.
@
@ assumes r0     = out (base address the stores are relative to)
|
||||
@         r1     = in ? (not referenced by this routine)
|
||||
@
|
||||
@         r12    = offsets (one 32-bit entry consumed per pass)
|
||||
@         r3-r10 = data pointers (each advanced by 8 bytes per pass)
|
||||
@         r11    = loop iterations (counted down to zero)
|
||||
@         r2     = const pointer (the words at [r2] and [r2, #8] are read)
|
||||
@         lr     = temp (store address for the current pass)
|
||||
|
||||
@ NOTE: there is no return instruction after the loop; execution continues
@ into the code that follows (vfp_o).
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _vfp_e
|
||||
_vfp_e:
|
||||
#else
|
||||
.globl vfp_e
|
||||
vfp_e:
|
||||
#endif
|
||||
_vfp_e_loop:
|
||||
@ s15 and s11 hold the two multiplier constants for this pass; s11 is
@ reused for data further down and reloaded here on the next pass
vldr s15, [r2, #8]
|
||||
vldr s2, [r3] @ x0
|
||||
vldr s0, [r3, #4]
|
||||
vldr s4, [r4] @ x1
|
||||
vldr s11, [r2]
|
||||
vldr s10, [r7] @ x4
|
||||
vldr s3, [r7, #4]
|
||||
vldr s8, [r8] @ x5
|
||||
vldr s1, [r8, #4]
|
||||
vldr s14, [r9] @ x6
|
||||
vldr s9, [r9, #4]
|
||||
vldr s6, [r10] @ x7
|
||||
vldr s12, [r10, #4]
|
||||
vsub.f32 s18, s3, s1
|
||||
vsub.f32 s7, s10, s8
|
||||
vsub.f32 s5, s14, s6
|
||||
vadd.f32 s6, s14, s6
|
||||
vldr s24, [r5, #4]
|
||||
vsub.f32 s14, s9, s12
|
||||
vldr s22, [r6, #4]
|
||||
vadd.f32 s8, s10, s8
|
||||
vldr s28, [r6] @ x3
|
||||
vldr s17, [r5] @ x2
|
||||
vadd.f32 s10, s9, s12
|
||||
vmul.f32 s13, s18, s15
|
||||
vmul.f32 s9, s7, s11
|
||||
vmul.f32 s16, s5, s11
|
||||
vmul.f32 s18, s18, s11
|
||||
vmul.f32 s30, s14, s11
|
||||
@ last use of s11 as a constant was just above; it now takes the second
@ float of x1
vldr s11, [r4, #4]
|
||||
@ every input pair has been loaded: step all eight data pointers by 8 bytes
add r3, r3, #8
|
||||
add r4, r4, #8
|
||||
add r5, r5, #8
|
||||
add r6, r6, #8
|
||||
add r7, r7, #8
|
||||
add r8, r8, #8
|
||||
add r9, r9, #8
|
||||
add r10, r10, #8
|
||||
vmul.f32 s12, s5, s15
|
||||
vmul.f32 s20, s14, s15
|
||||
vadd.f32 s5, s2, s4
|
||||
vadd.f32 s3, s3, s1
|
||||
vmul.f32 s15, s7, s15
|
||||
vadd.f32 s1, s24, s22
|
||||
vsub.f32 s7, s24, s22
|
||||
vadd.f32 s24, s17, s28
|
||||
vadd.f32 s26, s0, s11
|
||||
vsub.f32 s14, s9, s13
|
||||
vsub.f32 s2, s2, s4
|
||||
vadd.f32 s4, s16, s20
|
||||
vsub.f32 s22, s0, s11
|
||||
vsub.f32 s16, s17, s28
|
||||
vadd.f32 s9, s5, s24
|
||||
vadd.f32 s28, s18, s15
|
||||
vadd.f32 s13, s8, s6
|
||||
vsub.f32 s5, s5, s24
|
||||
vsub.f32 s24, s8, s6
|
||||
vadd.f32 s11, s26, s1
|
||||
vsub.f32 s12, s30, s12
|
||||
vadd.f32 s20, s3, s10
|
||||
vsub.f32 s15, s3, s10
|
||||
vsub.f32 s3, s26, s1
|
||||
vadd.f32 s18, s9, s13
|
||||
vadd.f32 s10, s14, s4
|
||||
vadd.f32 s6, s2, s7 @
|
||||
vsub.f32 s0, s2, s7 @
|
||||
vadd.f32 s26, s11, s20
|
||||
vsub.f32 s4, s14, s4
|
||||
vsub.f32 s8, s22, s16 @
|
||||
vadd.f32 s1, s28, s12
|
||||
@ lr = r0 + 4 * (next offsets entry); r12 is post-incremented by 4 bytes
ldr lr, [r12], #4
|
||||
add lr, r0, lr, lsl #2
|
||||
@ sets the flags tested by the bne at the bottom; none of the instructions
@ in between update the integer condition flags
subs r11, r11, #1
|
||||
vstr s18, [lr]
|
||||
vsub.f32 s2, s28, s12
|
||||
vadd.f32 s12, s22, s16 @
|
||||
vsub.f32 s16, s3, s24 @
|
||||
vsub.f32 s13, s9, s13
|
||||
vstr s26, [lr, #4]
|
||||
vadd.f32 s28, s5, s15 @
|
||||
vsub.f32 s7, s5, s15 @
|
||||
vadd.f32 s14, s6, s10
|
||||
vadd.f32 s5, s8, s1
|
||||
vadd.f32 s9, s0, s2 @
|
||||
vsub.f32 s2, s0, s2 @
|
||||
vsub.f32 s11, s11, s20
|
||||
vstr s28, [lr, #16]
|
||||
vadd.f32 s3, s3, s24 @
|
||||
vstr s16, [lr, #20]
|
||||
vsub.f32 s6, s6, s10
|
||||
vstr s13, [lr, #32]
|
||||
vsub.f32 s13, s12, s4 @
|
||||
vsub.f32 s8, s8, s1
|
||||
vadd.f32 s0, s12, s4 @
|
||||
vstr s11, [lr, #36]
|
||||
vstr s7, [lr, #48]
|
||||
vstr s3, [lr, #52]
|
||||
vstr s14, [lr, #8]
|
||||
vstr s5, [lr, #12]
|
||||
vstr s9, [lr, #24]
|
||||
vstr s13, [lr, #28]
|
||||
vstr s6, [lr, #40]
|
||||
vstr s8, [lr, #44]
|
||||
vstr s2, [lr, #56]
|
||||
vstr s0, [lr, #60]
|
||||
bne _vfp_e_loop
|
||||
|
||||
@ assumes r0 = out
|
||||
@ r1 = in ?
|
||||
@
|
||||
@ r12 = offsets
|
||||
@ r3-r10 = data pointers
|
||||
@ r11 = loop iterations
|
||||
@ r2 & lr = temps
|
||||
@ vfp_o -- loop that, per iteration, reads eight two-float elements (one
@ from each of r3-r10), runs two independent 4-input add/sub butterflies on
@ them and writes the sixteen resulting floats to one 64-byte output chunk.
@
@ Register use visible in this block:
@   r0       base added to each scaled offset to form the store address
@   r12      pointer to 32-bit offsets; one is consumed per iteration
@   r3-r10   input pointers, each advanced by 8 bytes per iteration
@   r11      down-counter; the loop repeats until it reaches zero
@   r2       scratch: store address for the current iteration
@   s0-s8, s10, s12, s14   scratch VFP registers
@
@ Per butterfly, with a, b, c, d the four inputs in load order and assuming
@ each pair of floats is (re, im) -- TODO confirm against the caller:
@   out0 = (a + b) + (c + d)
@   out1 = (a - b) - i*(c - d)
@   out2 = (a + b) - (c + d)
@   out3 = (a - b) + i*(c - d)
@
@ The entry label and the loop label coincide (no setup code), and there is
@ no return after the final bne: once r11 reaches zero, execution continues
@ into whatever follows this block.
@ NOTE(review): presumably this body is copied into generated code by the
@ code generator rather than being called directly -- confirm.
@ NOTE(review): the bare trailing @ markers tag the four cross-term add/sub
@ instructions of each butterfly; presumably these are the ones whose sign
@ differs for the opposite transform direction -- confirm.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _vfp_o
|
||||
_vfp_o:
|
||||
#else
|
||||
.globl vfp_o
|
||||
vfp_o:
|
||||
#endif
|
||||
_vfp_o_loop:
|
||||
@ first butterfly: inputs come from r3-r6 (second float of each at offset 4)
vldr s4, [r3] @ x0
|
||||
vldr s0, [r3, #4]
|
||||
vldr s6, [r4] @ x1
|
||||
vldr s5, [r4, #4]
|
||||
vldr s7, [r5] @ x2
|
||||
vldr s1, [r5, #4]
|
||||
vldr s3, [r6] @ x3
|
||||
vldr s8, [r6, #4]
|
||||
@ decrement the counter now; no later instruction in the loop sets the
@ condition flags, so the bne at the bottom still tests this result
subs r11, r11, #1
|
||||
@ r2 = r0 + 4 * (next offset); r12 post-increments to the following offset
ldr r2, [r12], #4
|
||||
add r2, r0, r2, lsl #2
|
||||
vadd.f32 s2, s4, s6
|
||||
vadd.f32 s14, s0, s5
|
||||
vadd.f32 s10, s1, s8
|
||||
vsub.f32 s4, s4, s6
|
||||
vsub.f32 s0, s0, s5
|
||||
vadd.f32 s12, s7, s3
|
||||
vsub.f32 s6, s7, s3
|
||||
vsub.f32 s8, s1, s8
|
||||
vadd.f32 s5, s14, s10
|
||||
vsub.f32 s10, s14, s10
|
||||
vadd.f32 s7, s2, s12
|
||||
vsub.f32 s1, s0, s6 @
|
||||
vsub.f32 s12, s2, s12
|
||||
vadd.f32 s3, s4, s8 @
|
||||
vsub.f32 s2, s4, s8 @
|
||||
vadd.f32 s0, s0, s6 @
|
||||
@ first butterfly -> bytes 0..31 of the output chunk
vstr s7, [r2]
|
||||
@ s7 has just been stored, so the x2 load for the second butterfly is
@ issued early here
vldr s7, [r9] @ x2
|
||||
vstr s5, [r2, #4]
|
||||
vstr s3, [r2, #8]
|
||||
vstr s1, [r2, #12]
|
||||
vstr s12, [r2, #16]
|
||||
vstr s10, [r2, #20]
|
||||
vstr s2, [r2, #24]
|
||||
vstr s0, [r2, #28]
|
||||
@ second butterfly: remaining inputs come from r7, r8, r10 and [r9, #4]
vldr s4, [r7] @ x0
|
||||
vldr s0, [r7, #4]
|
||||
vldr s6, [r8] @ x1
|
||||
vldr s5, [r8, #4]
|
||||
vldr s3, [r10] @ x3
|
||||
vldr s8, [r10, #4]
|
||||
vldr s1, [r9, #4]
|
||||
@ advance all eight input pointers to the next two-float element
add r3, r3, #8
|
||||
add r4, r4, #8
|
||||
add r5, r5, #8
|
||||
add r6, r6, #8
|
||||
add r7, r7, #8
|
||||
add r8, r8, #8
|
||||
add r9, r9, #8
|
||||
add r10, r10, #8
|
||||
vadd.f32 s2, s4, s6
|
||||
vadd.f32 s14, s0, s5
|
||||
vadd.f32 s10, s1, s8
|
||||
vsub.f32 s4, s4, s6
|
||||
vsub.f32 s0, s0, s5
|
||||
vadd.f32 s12, s7, s3
|
||||
vsub.f32 s6, s7, s3
|
||||
vsub.f32 s8, s1, s8
|
||||
vadd.f32 s5, s14, s10
|
||||
vsub.f32 s10, s14, s10
|
||||
vadd.f32 s7, s2, s12
|
||||
vsub.f32 s1, s0, s6 @
|
||||
vsub.f32 s12, s2, s12
|
||||
vadd.f32 s3, s4, s8 @
|
||||
vsub.f32 s2, s4, s8 @
|
||||
vadd.f32 s0, s0, s6 @
|
||||
@ second butterfly -> bytes 32..63 of the output chunk
vstr s7, [r2, #32]
|
||||
vstr s5, [r2, #36]
|
||||
vstr s3, [r2, #40]
|
||||
vstr s1, [r2, #44]
|
||||
vstr s12, [r2, #48]
|
||||
vstr s10, [r2, #52]
|
||||
vstr s2, [r2, #56]
|
||||
vstr s0, [r2, #60]
|
||||
@ loop while the counter decremented by the subs above is non-zero
bne _vfp_o_loop
|
||||
|
||||
@ vfp_x4 -- in-place pass over four streams of two-float elements; the loop
@ runs exactly four times (r11 is loaded with 4) and returns with bx lr.
@
@ Inputs as used by the code:
@   r0  address of stream 0
@   r1  spacing term: stream 1 = r0 + 2*r1, stream 2 = r0 + 4*r1,
@       stream 3 = r0 + 6*r1
@   r2  pointer to a table of float pairs; one pair is read per iteration
@       (presumably twiddle factors -- TODO confirm)
@   lr  return address
@
@ Per iteration, writing x0..x3 for the elements of streams 0..3, w for the
@ table pair, and assuming each pair of floats is (re, im) -- TODO confirm:
@   t2  = x2 * w
@   t3  = x3 * conj(w)
@   sum = t2 + t3,  dif = t2 - t3
@   stream 0 <- x0 + sum
@   stream 1 <- x1 - i*dif
@   stream 2 <- x0 - sum
@   stream 3 <- x1 + i*dif
@
@ Registers overwritten here without being saved in this block:
@ r3-r7, r11 and s0-s15.
@ NOTE(review): the bare trailing @ markers tag the add/sub instructions
@ that produce the stream 1 and stream 3 results; presumably these are the
@ ones whose sign differs for the opposite transform direction -- confirm.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _vfp_x4
|
||||
_vfp_x4:
|
||||
#else
|
||||
.globl vfp_x4
|
||||
vfp_x4:
|
||||
#endif
|
||||
@ set up the four stream pointers (r3-r6) and the table pointer (r7)
add r3, r0, #0
|
||||
add r7, r2, #0
|
||||
add r4, r0, r1, lsl #1
|
||||
add r5, r0, r1, lsl #2
|
||||
add r6, r4, r1, lsl #2
|
||||
@ fixed trip count of four
mov r11, #4
|
||||
_vfp_x4_loop:
|
||||
|
||||
vldr s8, [r3, #0]
|
||||
vldr s9, [r3, #4]
|
||||
vldr s10, [r4, #0]
|
||||
vldr s11, [r4, #4]
|
||||
vldr s12, [r5, #0]
|
||||
vldr s13, [r5, #4]
|
||||
vldr s14, [r6, #0]
|
||||
vldr s15, [r6, #4]
|
||||
@ table pair for this iteration, then step the table pointer
vldr s2, [r7, #0]
|
||||
vldr s3, [r7, #4]
|
||||
add r7, r7, #8
|
||||
@ decrement the counter now; no later instruction in the loop sets the
@ condition flags, so the bne at the bottom still tests this result
subs r11, r11, #1
|
||||
@ eight partial products of x2 and x3 with the table pair
vmul.f32 s0, s13, s3
|
||||
vmul.f32 s5, s12, s2
|
||||
vmul.f32 s1, s14, s2
|
||||
vmul.f32 s4, s14, s3
|
||||
vmul.f32 s14, s12, s3
|
||||
vmul.f32 s13, s13, s2
|
||||
vmul.f32 s12, s15, s3
|
||||
vmul.f32 s2, s15, s2
|
||||
@ combine into t2 = (s0, s13) and t3 = (s12, s1)
vsub.f32 s0, s5, s0
|
||||
vadd.f32 s13, s13, s14
|
||||
vadd.f32 s12, s12, s1
|
||||
vsub.f32 s1, s2, s4
|
||||
@ sum = (s15, s14), dif = (s12, s13)
vadd.f32 s15, s0, s12
|
||||
vsub.f32 s12, s0, s12
|
||||
vadd.f32 s14, s13, s1
|
||||
vsub.f32 s13, s13, s1
|
||||
vadd.f32 s0, s8, s15
|
||||
vadd.f32 s1, s9, s14
|
||||
vadd.f32 s2, s10, s13 @
|
||||
vsub.f32 s4, s8, s15
|
||||
vsub.f32 s3, s11, s12 @
|
||||
@ write each result back over its input and step that stream by 8 bytes
vstr s0, [r3, #0]
|
||||
vstr s1, [r3, #4]
|
||||
add r3, r3, #8
|
||||
vsub.f32 s5, s9, s14
|
||||
vsub.f32 s6, s10, s13 @
|
||||
vadd.f32 s7, s11, s12 @
|
||||
vstr s2, [r4, #0]
|
||||
vstr s3, [r4, #4]
|
||||
add r4, r4, #8
|
||||
vstr s4, [r5, #0]
|
||||
vstr s5, [r5, #4]
|
||||
add r5, r5, #8
|
||||
vstr s6, [r6, #0]
|
||||
vstr s7, [r6, #4]
|
||||
add r6, r6, #8
|
||||
@ loop while the counter decremented by the subs above is non-zero
bne _vfp_x4_loop
|
||||
bx lr
|
||||
|
||||
@ vfp_x8 -- in-place pass over eight streams of two-float elements, combined
@ with three table pairs per iteration; returns with bx lr.
@
@ Inputs as used by the code:
@   r0  address of stream 0 (data0)
@   r1  byte spacing between consecutive streams: data k = r0 + k*r1;
@       also sets the trip count, which is r1 >> 3
@   r2  pointer to the lookup table (LUT); six floats are read per
@       iteration (presumably twiddle factors -- TODO confirm)
@   lr  return address
@
@ Each iteration is done in two halves:
@   1. loads x0-x4 and x6 plus LUT floats 0-3, and writes new values to
@      streams 0, 2, 4 and 6;
@   2. loads x5 and x7 plus LUT floats 4-5, reuses x1 and the x2/x3
@      products kept from the first half, and writes streams 1, 3, 5 and 7.
@ Every stream pointer advances by 8 bytes per iteration and r12 by 24.
@
@ Registers overwritten here without being saved in this block:
@ r3-r12, s0-s16 and the even registers s18-s30.
@ NOTE(review): s16-s31 are callee-saved under the ARM procedure call
@ standard; presumably the caller or generated prologue preserves them --
@ confirm.
@ NOTE(review): if r1 < 8 the counter starts at zero and the first adds
@ leaves it non-zero, so the loop would not stop after one pass; presumably
@ callers always pass r1 >= 8 -- confirm.
@ NOTE(review): the bare trailing @ markers tag selected add/sub
@ instructions; presumably these are the ones whose sign differs for the
@ opposite transform direction -- confirm.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _vfp_x8
|
||||
_vfp_x8:
|
||||
#else
|
||||
.globl vfp_x8
|
||||
vfp_x8:
|
||||
#endif
|
||||
@ r11 starts at zero and becomes the negative trip count further down
mov r11, #0
|
||||
add r3, r0, #0 @ data0
|
||||
add r5, r0, r1, lsl #1 @ data2
|
||||
add r4, r0, r1 @ data1
|
||||
add r7, r5, r1, lsl #1 @ data4
|
||||
add r6, r5, r1 @ data3
|
||||
add r9, r7, r1, lsl #1 @ data6
|
||||
add r8, r7, r1 @ data5
|
||||
add r10, r9, r1 @ data7
|
||||
add r12, r2, #0 @ LUT
|
||||
|
||||
@ r11 = -(r1 >> 3); the adds in the loop steps it up toward zero
sub r11, r11, r1, lsr #3
|
||||
_vfp_x8_loop:
|
||||
@ first half: load streams 0-3, the first LUT pair, and streams 4 and 6
vldr s10, [r3, #0] @ x0-re
|
||||
vldr s8, [r3, #4] @ x0-im
|
||||
vldr s2, [r4, #0] @ x1-re
|
||||
vldr s0, [r4, #4] @ x1-im
|
||||
vldr s6, [r5, #0] @ x2-re
|
||||
vldr s4, [r5, #4] @ x2-im
|
||||
vldr s13, [r6, #0] @ x3-re
|
||||
vldr s15, [r6, #4] @ x3-im
|
||||
vldr s7, [r12]
|
||||
vldr s11, [r12, #4]
|
||||
vldr s5, [r7, #0] @ x4-re
|
||||
vldr s1, [r7, #4] @ x4-im
|
||||
vldr s28, [r9, #0] @ x6-re
|
||||
vldr s18, [r9, #4] @ x6-im
|
||||
@ bump the counter now; no later instruction in the loop sets the
@ condition flags, so the bne at the bottom still tests this result
adds r11, r11, #1
|
||||
vmul.f32 s14, s15, s7
|
||||
vldr s24, [r12, #12]
|
||||
vmul.f32 s12, s13, s11
|
||||
vmul.f32 s26, s13, s7
|
||||
vldr s13, [r12, #8]
|
||||
vmul.f32 s3, s4, s11
|
||||
vmul.f32 s15, s15, s11
|
||||
vmul.f32 s16, s4, s7
|
||||
vmul.f32 s9, s6, s7
|
||||
vmul.f32 s11, s6, s11
|
||||
vmul.f32 s7, s18, s24
|
||||
vmul.f32 s20, s1, s24
|
||||
vmul.f32 s30, s5, s13
|
||||
@ s4, s12, s6 and s14 (x2/x3 products) stay live until the second half
vadd.f32 s4, s26, s15
|
||||
vsub.f32 s12, s14, s12
|
||||
vsub.f32 s6, s9, s3
|
||||
vadd.f32 s14, s16, s11
|
||||
vmul.f32 s22, s28, s13
|
||||
vmul.f32 s26, s28, s24
|
||||
vmul.f32 s18, s18, s13
|
||||
vmul.f32 s5, s5, s24
|
||||
vmul.f32 s1, s1, s13
|
||||
vsub.f32 s9, s30, s20
|
||||
vadd.f32 s16, s14, s12
|
||||
vadd.f32 s3, s22, s7
|
||||
vadd.f32 s15, s6, s4
|
||||
vsub.f32 s11, s18, s26
|
||||
vadd.f32 s18, s1, s5
|
||||
vadd.f32 s13, s8, s16
|
||||
vadd.f32 s1, s9, s3
|
||||
vadd.f32 s7, s10, s15
|
||||
vsub.f32 s15, s10, s15
|
||||
vsub.f32 s10, s9, s3
|
||||
vadd.f32 s5, s18, s11
|
||||
vsub.f32 s11, s18, s11
|
||||
vsub.f32 s8, s8, s16
|
||||
vadd.f32 s20, s7, s1
|
||||
vsub.f32 s7, s7, s1
|
||||
vadd.f32 s18, s13, s5
|
||||
vadd.f32 s16, s15, s11 @
|
||||
vsub.f32 s9, s8, s10 @
|
||||
vsub.f32 s3, s13, s5
|
||||
vsub.f32 s1, s15, s11 @
|
||||
@ write back streams 0, 2, 4 and 6 and step their pointers by 8 bytes
vstr s20, [r3]
|
||||
vadd.f32 s8, s8, s10 @
|
||||
vstr s18, [r3, #4]
|
||||
add r3, r3, #8
|
||||
vstr s16, [r5]
|
||||
vstr s9, [r5, #4]
|
||||
add r5, r5, #8
|
||||
vstr s7, [r7]
|
||||
vstr s3, [r7, #4]
|
||||
add r7, r7, #8
|
||||
vstr s1, [r9]
|
||||
vstr s8, [r9, #4]
|
||||
add r9, r9, #8
|
||||
@ second half: streams 5 and 7 plus the third LUT pair; x1 (s2, s0) and
@ the x2/x3 products (s14, s12, s6, s4) are still live from the first half
vldr s10, [r8, #0] @ x5-re
|
||||
vldr s8, [r8, #4] @ x5-im
|
||||
vldr s5, [r10, #0] @ x7-re
|
||||
vldr s11, [r10, #4] @ x7-im
|
||||
vldr s1, [r12, #16]
|
||||
vldr s15, [r12, #20]
|
||||
@ six LUT floats consumed this iteration
add r12, r12, #24
|
||||
vmul.f32 s9, s5, s1
|
||||
vmul.f32 s3, s11, s15
|
||||
vmul.f32 s13, s10, s1
|
||||
vmul.f32 s7, s8, s15
|
||||
vmul.f32 s5, s5, s15
|
||||
vmul.f32 s11, s11, s1
|
||||
vmul.f32 s10, s10, s15
|
||||
vmul.f32 s15, s8, s1
|
||||
vsub.f32 s1, s14, s12
|
||||
vadd.f32 s8, s9, s3
|
||||
vsub.f32 s3, s6, s4
|
||||
vsub.f32 s12, s13, s7
|
||||
vsub.f32 s5, s11, s5
|
||||
vadd.f32 s7, s15, s10
|
||||
vadd.f32 s4, s2, s1 @
|
||||
vsub.f32 s2, s2, s1 @
|
||||
vsub.f32 s6, s0, s3 @
|
||||
vadd.f32 s10, s12, s8
|
||||
vsub.f32 s9, s12, s8
|
||||
vadd.f32 s0, s0, s3 @
|
||||
vsub.f32 s1, s7, s5
|
||||
vadd.f32 s14, s7, s5
|
||||
vadd.f32 s7, s4, s10
|
||||
vsub.f32 s8, s4, s10
|
||||
vsub.f32 s12, s0, s9 @
|
||||
vadd.f32 s3, s2, s1 @
|
||||
vadd.f32 s5, s6, s14
|
||||
vsub.f32 s4, s6, s14
|
||||
vsub.f32 s2, s2, s1 @
|
||||
vadd.f32 s0, s0, s9 @
|
||||
@ write back streams 1, 3, 5 and 7 and step their pointers by 8 bytes
vstr s7, [r4]
|
||||
vstr s5, [r4, #4]
|
||||
add r4, r4, #8
|
||||
vstr s3, [r6]
|
||||
vstr s12, [r6, #4]
|
||||
add r6, r6, #8
|
||||
vstr s8, [r8]
|
||||
vstr s4, [r8, #4]
|
||||
add r8, r8, #8
|
||||
vstr s2, [r10]
|
||||
vstr s0, [r10, #4]
|
||||
add r10, r10, #8
|
||||
@ loop until the counter stepped by the adds above reaches zero
bne _vfp_x8_loop
|
||||
bx lr
|
||||
|
||||
|
||||
@ vfp_end -- global label placed after vfp_x8, followed only by a return.
@ NOTE(review): presumably an end marker so the size of the preceding
@ routine can be computed as a difference of labels -- confirm against the
@ code that references it.
.align 4
|
||||
#ifdef __APPLE__
|
||||
.globl _vfp_end
|
||||
_vfp_end:
|
||||
#else
|
||||
.globl vfp_end
|
||||
vfp_end:
|
||||
#endif
|
||||
@ returns immediately if ever called
bx lr
|
||||
Reference in New Issue
Block a user