From f33ddcb66a514abfc94f6d65659c0c43846eace3 Mon Sep 17 00:00:00 2001 From: dreamer Date: Sun, 16 Jul 2023 14:25:14 +0200 Subject: [PATCH 01/12] first setup of rfft~ components --- hvcc/core/hv2ir/HIrRFFT.py | 48 + hvcc/core/json/heavy.ir.json | 54 + hvcc/generators/ir2c/SignalRFFT.py | 73 + hvcc/generators/ir2c/ir2c.py | 3 + hvcc/generators/ir2c/static/HvSignalRFFT.c | 106 + hvcc/generators/ir2c/static/HvSignalRFFT.h | 52 + hvcc/generators/ir2c/static/pffft.c | 1884 +++++++++++++++++ hvcc/generators/ir2c/static/pffft.h | 177 ++ hvcc/generators/ir2c/templates/Heavy_NAME.cpp | 2 + hvcc/interpreters/pd2hv/libs/pd/rfft~.pd | 31 + hvcc/interpreters/pd2hv/libs/pd/rifft~.pd | 31 + 11 files changed, 2461 insertions(+) create mode 100644 hvcc/core/hv2ir/HIrRFFT.py create mode 100644 hvcc/generators/ir2c/SignalRFFT.py create mode 100644 hvcc/generators/ir2c/static/HvSignalRFFT.c create mode 100644 hvcc/generators/ir2c/static/HvSignalRFFT.h create mode 100644 hvcc/generators/ir2c/static/pffft.c create mode 100644 hvcc/generators/ir2c/static/pffft.h create mode 100644 hvcc/interpreters/pd2hv/libs/pd/rfft~.pd create mode 100644 hvcc/interpreters/pd2hv/libs/pd/rifft~.pd diff --git a/hvcc/core/hv2ir/HIrRFFT.py b/hvcc/core/hv2ir/HIrRFFT.py new file mode 100644 index 00000000..be8b961e --- /dev/null +++ b/hvcc/core/hv2ir/HIrRFFT.py @@ -0,0 +1,48 @@ +# Copyright (C) 2014-2018 Enzien Audio, Ltd. +# Copyright (C) 2023 Wasted Audio +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from typing import Dict, Optional + +from .HeavyIrObject import HeavyIrObject +from .HeavyGraph import HeavyGraph + + +class HIrRFFT(HeavyIrObject): + """ __rfft~f + """ + + def __init__( + self, + obj_type: str, + args: Optional[Dict] = None, + graph: Optional[HeavyGraph] = None, + annotations: Optional[Dict] = None + ) -> None: + assert obj_type in {"__rfft~f", "__rifft~f"} + super().__init__(obj_type, args=args, graph=graph, annotations=annotations) + + def reduce(self) -> Optional[tuple]: + if self.graph is not None: + table_obj = self.graph.resolve_object_for_name( + self.args["table"], + ["table", "__table"]) + if table_obj is not None: + self.args["table_id"] = table_obj.id + return ({self}, []) + else: + self.add_error(f"Cannot find table named \"{self.args['table']}\" for object {self}.") + + return None diff --git a/hvcc/core/json/heavy.ir.json b/hvcc/core/json/heavy.ir.json index cdd11435..dcb647c3 100644 --- a/hvcc/core/json/heavy.ir.json +++ b/hvcc/core/json/heavy.ir.json @@ -2223,6 +2223,60 @@ "-->" ] }, + "__rfft~f": { + "inlets": [ + "~f>", + "-->", + "-->" + ], + "ir": { + "control": false, + "signal": true, + "init": true + }, + "outlets": [ + "~f>", + "~f>" + ], + "args": [{ + "name": "table_id", + "value_type": "string", + "description": "", + "default": "", + "required": true + }], + "perf": { + "avx": 0, + "sse": 0 + } + }, + "__rifft~f": { + "inlets": [ + "~f>", + "~f>", + "-->", + "-->" + ], + "ir": { + "control": false, + "signal": true, + "init": true + }, + "outlets": [ + "~f>" + ], + "args": [{ + "name": "table_id", + "value_type": "string", + "description": "", + "default": "", + "required": true + }], + "perf": { + "avx": 0, + "sse": 0 + } + }, "__rpole~f": { "inlets": [ "~f>", diff --git a/hvcc/generators/ir2c/SignalRFFT.py b/hvcc/generators/ir2c/SignalRFFT.py new file mode 100644 index 00000000..1d1514be --- /dev/null +++ b/hvcc/generators/ir2c/SignalRFFT.py @@ -0,0 +1,73 @@ +# Copyright (C) 2014-2018 Enzien Audio, Ltd. +# Copyright (C) 2023 Wasted Audio +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from typing import Dict, List + +from .HeavyObject import HeavyObject + + +class SignalRFFT(HeavyObject): + + c_struct = "SignalRFFT" + preamble = "sRFFT" + + @classmethod + def get_C_header_set(cls) -> set: + return {"HvSignalRFFT.h"} + + @classmethod + def get_C_file_set(cls) -> set: + return {"HvSignalRFFT.h", "HvSignalRFFT.c", "pffft.h", "pffft.c"} + + @classmethod + def get_C_init(cls, obj_type: str, obj_id: int, args: Dict) -> List[str]: + return [ + "sRFFT_init(&sRFFT_{0}, &hTable_{1}, {2});".format( + obj_id, + args["table_id"], + 64) + ] + + @classmethod + def get_C_onMessage(cls, obj_type: str, obj_id: int, inlet_index: int, args: Dict) -> List[str]: + return [ + "sRFFT_onMessage(_c, &Context(_c)->sRFFT_{0}, {1}, m, NULL);".format( + obj_id, + inlet_index) + ] + + @classmethod + def get_C_process(cls, process_dict: Dict, obj_type: str, obj_id: int, args: Dict) -> List[str]: + if obj_type == "__rfft~f": + return [ + "__hv_rfft_f(&sRFFT_{0}, VIf({1}), VOf({2}), VOf({3}));".format( + process_dict["id"], + cls._c_buffer(process_dict["inputBuffers"][0]), + cls._c_buffer(process_dict["outputBuffers"][0]), + cls._c_buffer(process_dict["outputBuffers"][1]) + ) + ] + elif obj_type == "__rifft~f": + return [ + "__hv_rifft_f(&sRFFT_{0}, VIf({1}), VIf({2}), VOf({3}));".format( + process_dict["id"], + cls._c_buffer(process_dict["inputBuffers"][0]), + cls._c_buffer(process_dict["inputBuffers"][1]), + cls._c_buffer(process_dict["outputBuffers"][0]) + ) + ] + else: + raise Exception diff --git a/hvcc/generators/ir2c/ir2c.py b/hvcc/generators/ir2c/ir2c.py index 5671c65a..52ba546d 100644 --- a/hvcc/generators/ir2c/ir2c.py +++ b/hvcc/generators/ir2c/ir2c.py @@ -57,6 +57,7 @@ from hvcc.generators.ir2c.SignalLorenz import SignalLorenz from hvcc.generators.ir2c.SignalMath import SignalMath from hvcc.generators.ir2c.SignalPhasor import SignalPhasor +from hvcc.generators.ir2c.SignalRFFT import SignalRFFT from hvcc.generators.ir2c.SignalRPole import SignalRPole from hvcc.generators.ir2c.SignalSample import SignalSample from hvcc.generators.ir2c.SignalSamphold import SignalSamphold @@ -97,6 +98,8 @@ class ir2c: "__tabwrite_stoppable~f": SignalTabwrite, "__phasor~f": SignalPhasor, "__phasor_k~f": SignalPhasor, + "__rfft~f": SignalRFFT, + "__rifft~f": SignalRFFT, "__sample~f": SignalSample, "__samphold~f": SignalSamphold, "__slice": ControlSlice, diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.c b/hvcc/generators/ir2c/static/HvSignalRFFT.c new file mode 100644 index 00000000..0e04b2c9 --- /dev/null +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.c @@ -0,0 +1,106 @@ +/** + * Copyright (c) 2023 Wasted Audio + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include "HvSignalRFFT.h" +#include "pffft.h" + +hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size) { + o->table = table; + o->setup = pffft_new_setup(size, PFFFT_REAL); + hv_size_t numBytes = hTable_init(&o->inputs, size); + return numBytes; +} + +void sRFFT_free(SignalRFFT *o) { + o->table = NULL; + hTable_free(&o->inputs); +} + +void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, + const HvMessage *m, void *sendMessage) { + switch (letIndex) { + case 1: { + if (msg_isHashLike(m,0)) { + HvTable *table = hv_table_get(_c, msg_getHash(m,0)); + if (table != NULL) { + o->table = table; + if (hTable_getSize(&o->inputs) != hTable_getSize(table)) { + hTable_resize(&o->inputs, + (hv_uint32_t) hv_min_ui(hTable_getSize(&o->inputs), hTable_getSize(table))); + } + } + } + break; + } + case 2: { + if (msg_isFloat(m,0)) { + // rfft size should never exceed the coefficient table size + hTable_resize(&o->inputs, (hv_uint32_t) msg_getFloat(m,0)); + } + break; + } + default: return; + } +} + + +static inline int wrap(const int i, const int n) { + if (i < 0) return (i+n); + if (i >= n) return (i-n); + return i; +} + + +void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut1) { + hv_assert(o->table != NULL); + float *const work = hTable_getBuffer(o->table); + hv_assert(work != NULL); + const int n = hTable_getSize(o->table); // length fir filter + hv_assert((n&HV_N_SIMD_MASK) == 0); // n is a multiple of HV_N_SIMD + + float *const inputs = hTable_getBuffer(&o->inputs); + hv_assert(inputs != NULL); + const int m = hTable_getSize(&o->inputs); // length of input buffer. + hv_assert(m >= n); + const int h_orig = hTable_getHead(&o->inputs); + + + pffft_transform_ordered(o->setup, &bIn, &bOut0, work, PFFFT_FORWARD); + + __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer + hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); +} + + +void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { + hv_assert(o->table != NULL); + float *const work = hTable_getBuffer(o->table); + hv_assert(work != NULL); + const int n = hTable_getSize(o->table); // length fir filter + hv_assert((n&HV_N_SIMD_MASK) == 0); // n is a multiple of HV_N_SIMD + + float *const inputs = hTable_getBuffer(&o->inputs); + hv_assert(inputs != NULL); + const int m = hTable_getSize(&o->inputs); // length of input buffer. + hv_assert(m >= n); + const int h_orig = hTable_getHead(&o->inputs); + + + pffft_transform_ordered(o->setup, &bIn0, &bOut, work, PFFFT_BACKWARD); + + __hv_store_f(inputs+h_orig, bIn0); // store the new input to the inputs buffer + hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); +} diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.h b/hvcc/generators/ir2c/static/HvSignalRFFT.h new file mode 100644 index 00000000..9b2cea27 --- /dev/null +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) 2023 Wasted Audio + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _SIGNAL_RFFT_H_ +#define _SIGNAL_RFFT_H_ + +#include "HvHeavyInternal.h" +#include "pffft.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef HV_SIMD_NONE +#define PFFFT_SIMD_DISABLE +#endif + +typedef struct SignalRFFT { + struct HvTable *table; + struct HvTable inputs; + struct PFFFT_Setup *setup; +} SignalRFFT; + +hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *coeffs, const int size); + +void sRFFT_free(SignalRFFT *o); + +void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, + const HvMessage *m, void *sendMessage); + +void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut1); + +void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _SIGNAL_RFFT_H_ diff --git a/hvcc/generators/ir2c/static/pffft.c b/hvcc/generators/ir2c/static/pffft.c new file mode 100644 index 00000000..1b938e66 --- /dev/null +++ b/hvcc/generators/ir2c/static/pffft.c @@ -0,0 +1,1884 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* + ChangeLog: + - 2011/10/02, version 1: This is the very first release of this file. +*/ + +#include "pffft.h" +#include +#include +#include +#include + +/* detect compiler flavour */ +#if defined(_MSC_VER) +# define COMPILER_MSVC +#elif defined(__GNUC__) +# define COMPILER_GCC +#endif + +#if defined(COMPILER_GCC) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; +#elif defined(COMPILER_MSVC) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define NEVER_INLINE(return_type) __declspec(noinline) return_type +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) +#endif + + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + + +// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +//#define PFFFT_SIMD_DISABLE + +/* + Altivec support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) +#include +typedef vector float v4sf; +# define SIMD_SZ 4 +# define VZERO() ((vector float) vec_splat_u8(0)) +# define VMUL(a,b) vec_madd(a,b, VZERO()) +# define VADD(a,b) vec_add(a,b) +# define VMADD(a,b,c) vec_madd(a,b,c) +# define VSUB(a,b) vec_sub(a,b) +inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } +# define LD_PS1(p) ld_ps1(&p) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \ + vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \ + v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ + } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + v4sf y0 = vec_mergeh(x0, x2); \ + v4sf y1 = vec_mergel(x0, x2); \ + v4sf y2 = vec_mergeh(x1, x3); \ + v4sf y3 = vec_mergel(x1, x3); \ + x0 = vec_mergeh(y0, y2); \ + x1 = vec_mergel(y0, y2); \ + x2 = vec_mergeh(y1, y3); \ + x3 = vec_mergel(y1, y3); \ + } +# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) + +/* + SSE1 support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86)) + +#include +typedef __m128 v4sf; +# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. +# define VZERO() _mm_setzero_ps() +# define VMUL(a,b) _mm_mul_ps(a,b) +# define VADD(a,b) _mm_add_ps(a,b) +# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +# define VSUB(a,b) _mm_sub_ps(a,b) +# define LD_PS1(p) _mm_set1_ps(p) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } +# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) + +/* + ARM NEON support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__)) +# include +typedef float32x4_t v4sf; +# define SIMD_SZ 4 +# define VZERO() vdupq_n_f32(0) +# define VMUL(a,b) vmulq_f32(a,b) +# define VADD(a,b) vaddq_f32(a,b) +# define VMADD(a,b,c) vmlaq_f32(c,a,b) +# define VSUB(a,b) vsubq_f32(a,b) +# define LD_PS1(p) vld1q_dup_f32(&(p)) +# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + float32x4x2_t t0_ = vzipq_f32(x0, x2); \ + float32x4x2_t t1_ = vzipq_f32(x1, x3); \ + float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ + float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ + x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ + } +// marginally faster version +//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) +#else +# if !defined(PFFFT_SIMD_DISABLE) +# warning "building with simd disabled !\n"; +# define PFFFT_SIMD_DISABLE // fallback to scalar code +# endif +#endif + +// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead +#ifdef PFFFT_SIMD_DISABLE +typedef float v4sf; +# define SIMD_SZ 1 +# define VZERO() 0.f +# define VMUL(a,b) ((a)*(b)) +# define VADD(a,b) ((a)+(b)) +# define VMADD(a,b,c) ((a)*(b)+(c)) +# define VSUB(a,b) ((a)-(b)) +# define LD_PS1(p) (p) +# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) +#endif + +// shortcuts for complex multiplcations +#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } +#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +// multiply a scalar with a vector +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +#if !defined(PFFFT_SIMD_DISABLE) +typedef union v4sf_union { + v4sf v; + float f[4]; +} v4sf_union; + +#include + +#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) + +/* detect bugs with the vector support macros */ +void validate_pffft_simd(void) { + float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; + v4sf_union a0, a1, a2, a3, t, u; + memcpy(a0.f, f, 4*sizeof(float)); + memcpy(a1.f, f+4, 4*sizeof(float)); + memcpy(a2.f, f+8, 4*sizeof(float)); + memcpy(a3.f, f+12, 4*sizeof(float)); + + t = a0; u = a1; t.v = VZERO(); + printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); + t.v = VADD(a1.v, a2.v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); + t.v = VMUL(a1.v, a2.v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); + t.v = VMADD(a1.v, a2.v,a0.v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); + UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); + + t.v=LD_PS1(f[15]); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 15, 15, 15, 15); + t.v = VSWAPHL(a1.v, a2.v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 8, 9, 6, 7); + VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], + a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); +} +#else +void validate_pffft_simd() {} // allow test_pffft.c to call this function even when simd is not available.. +#endif //!PFFFT_SIMD_DISABLE + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... +void *pffft_aligned_malloc(size_t nb_bytes) { + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) return (void *) 0; + p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); + *((void **) p - 1) = p0; + return p; +} + +void pffft_aligned_free(void *p) { + if (p) free(*((void **) p - 1)); +} + +int pffft_simd_size() { return SIMD_SZ; } + +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { + int k, i; + int l1ido = l1*ido; + if (ido <= 2) { + for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { + ch[0] = VADD(cc[0], cc[ido+0]); + ch[l1ido] = VSUB(cc[0], cc[ido+0]); + ch[1] = VADD(cc[1], cc[ido+1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); + } + } else { + for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { + for (i=0; i 2); + for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { + for (i=0; i 2); + for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { + for (i = 0; i < ido-1; i += 2) { + ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; + wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { + static const float minus_one = -1.f; + int i, k, l1ido = l1*ido; + for (k=0; k < l1ido; k += ido) { + v4sf a = cc[k], b = cc[k + l1ido]; + ch[2*k] = VADD(a, b); + ch[2*(k+ido)-1] = VSUB(a, b); + } + if (ido < 2) return; + if (ido != 2) { + for (k=0; k < l1ido; k += ido) { + for (i=2; i 5) { + wa[i1-1] = wa[i-1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + + +v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { + v4sf *in = (v4sf*)input_readonly; + v4sf *out = (in == work2 ? work1 : work2); + int nf = ifac[1], k1; + int l1 = 1; + int iw = 0; + assert(in != out && work1 != work2); + for (k1=2; k1<=nf+1; k1++) { + int ip = ifac[k1]; + int l2 = ip*l1; + int ido = n / l2; + int idot = ido + ido; + switch (ip) { + case 5: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + int ix4 = ix3 + idot; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + } break; + case 4: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign); + } break; + case 2: { + passf2_ps(idot, l1, in, out, &wa[iw], isign); + } break; + case 3: { + int ix2 = iw + idot; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign); + } break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if (out == work2) { + out = work1; in = work2; + } else { + out = work2; in = work1; + } + } + + return in; /* this is in fact the output .. */ +} + + +struct PFFFT_Setup { + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int ifac[15]; + pffft_transform_t transform; + v4sf *data; // allocated room for twiddle coefs + float *e; // points into 'data' , N/4*3 elements + float *twiddle; // points into 'data', N/4 elements +}; + +PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { + PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup)); + int k, m; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } + if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } + //assert((N % 32) == 0); + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; + s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf)); + s->e = (float*)s->data; + s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); + + if (transform == PFFFT_REAL) { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A); + s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A); + } + } + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } else { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A); + s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A); + } + } + cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } + + /* check that N is decomposable with allowed prime factors */ + for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } + if (m != N/SIMD_SZ) { + pffft_destroy_setup(s); s = 0; + } + + return s; +} + + +void pffft_destroy_setup(PFFFT_Setup *s) { + pffft_aligned_free(s->data); + free(s); +} + +#if !defined(PFFFT_SIMD_DISABLE) + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { + v4sf g0, g1; + int k; + INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; + + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] + for (k=1; k < N; ++k) { + v4sf h0, h1; + INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { + v4sf g0, g1, h0, h1; + int k; + g0 = g1 = in[0]; ++in; + for (k=1; k < N; ++k) { + h0 = *in++; h1 = *in++; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; + g1 = h1; + } + h0 = *in++; h1 = g0; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); +} + +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N, Ncvec = setup->Ncvec; + const v4sf *vin = (const v4sf*)in; + v4sf *vout = (v4sf*)out; + assert(in != out); + if (setup->transform == PFFFT_REAL) { + int dk = N/32; + if (direction == PFFFT_FORWARD) { + for (k=0; k < dk; ++k) { + INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); + reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); + } else { + for (k=0; k < dk; ++k) { + UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); + unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8); + } + } else { + if (direction == PFFFT_FORWARD) { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + } + } else { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + +void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 -1 1 -1 0 0 0 0] [r2] + [1 0 -1 0 0 1 0 -1] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 1 0 -1 1 0 -1 0] [i1] + [0 0 0 0 1 -1 1 -1] [i2] + [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + +void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); + + VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + + +static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, + const v4sf *e, v4sf *out) { + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + r0 = *in0; i0 = *in1; + r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 0 -1 0 0 1 0 -1] [r2] + [1 -1 1 -1 0 0 0 0] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 -1 0 1 -1 0 1 0] [i1] + [0 -1 0 1 1 0 -1 0] [i2] + [0 0 0 0 -1 1 -1 1] [i3] + */ + + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + VCPLXMUL(r1,i1,e[0],e[1]); + VCPLXMUL(r2,i2,e[2],e[3]); + VCPLXMUL(r3,i3,e[4],e[5]); + + //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); + si0 = VADD(i0,i2); di0 = VSUB(i0,i2); + si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + +} + +static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union cr, ci, *uout = (v4sf_union*)out; + v4sf save = in[7], zero=VZERO(); + float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + static const float s = M_SQRT2/2; + + cr.v = in[0]; ci.v = in[Ncvec*2-1]; + assert(in != out); + pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); + + /* + [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + + [Xr(1)] ] [1 1 1 1 0 0 0 0] + [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + [Xi(1) ] [1 -1 1 -1 0 0 0 0] + [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; + xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; + xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; + xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; + xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; + xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; + xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; + xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; + + for (k=1; k < dk; ++k) { + v4sf save_next = in[8*k+7]; + pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, + e + k*6, out + k*8); + save = save_next; + } + +} + +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, + const v4sf *e, v4sf *out, int first) { + v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 0 -1 0 -1 -1 0] [r1] + [1 -1 -1 1 0 0 0 0] [r2] + [1 0 0 -1 0 1 1 0] [r3] + [0 0 0 0 1 -1 1 -1] * [i0] + [0 -1 1 0 1 0 0 1] [i1] + [0 0 0 0 1 1 -1 -1] [i2] + [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); + v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); + v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); + v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + VCPLXMULCONJ(r1,i1,e[0],e[1]); + VCPLXMULCONJ(r2,i2,e[2],e[3]); + VCPLXMULCONJ(r3,i3,e[4],e[5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + if (!first) { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union Xr, Xi, *uout = (v4sf_union*)out; + float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; + static const float s = M_SQRT2; + assert(in != out); + for (k=0; k < 4; ++k) { + Xr.f[k] = ((float*)in)[8*k]; + Xi.f[k] = ((float*)in)[8*k+4]; + } + + pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values + + /* + [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + + [cr0] [1 0 2 0 1 0 0 0] + [cr1] [1 0 0 0 -1 0 -2 0] + [cr2] [1 0 -2 0 1 0 0 0] + [cr3] [1 0 0 0 -1 0 2 0] + [ci0] [0 2 0 2 0 0 0 0] + [ci1] [0 s 0 -s 0 -s 0 -s] + [ci2] [0 0 0 0 0 -2 0 2] + [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for (k=1; k < dk; ++k) { + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); + } + + cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; + cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; + cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; + cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; + ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; + ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; + ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; + ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; +} + + +void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, + pffft_direction_t direction, int ordered) { + int k, Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + + const v4sf *vinput = (const v4sf*)finput; + v4sf *voutput = (v4sf*)foutput; + v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; + int ib = (nf_odd ^ ordered ? 1 : 0); + + assert(VALIGNED(finput) && VALIGNED(foutput)); + + //assert(finput != foutput); + if (direction == PFFFT_FORWARD) { + ib = !ib; + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } else { + v4sf *tmp = buff[ib]; + for (k=0; k < Ncvec; ++k) { + UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + } + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } + if (ordered) { + pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); + } else ib = !ib; + } else { + if (vinput == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); + vinput = buff[ib]; ib = !ib; + } + if (setup->transform == PFFFT_REAL) { + pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); + for (k=0; k < Ncvec; ++k) { + INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + } + } + } + + if (buff[ib] != voutput) { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(finput==foutput); + for (k=0; k < Ncvec; ++k) { + v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; + voutput[2*k] = a; voutput[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == voutput); +} + +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + int Ncvec = s->Ncvec; + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM + v4sf vscal = LD_PS1(scaling); + int i; +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union*)va)[0].f[0]; + ai = ((v4sf_union*)va)[1].f[0]; + br = ((v4sf_union*)vb)[0].f[0]; + bi = ((v4sf_union*)vb)[1].f[0]; + abr = ((v4sf_union*)vab)[0].f[0]; + abi = ((v4sf_union*)vab)[1].f[0]; + +#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc + const float *a_ = a, *b_ = b; float *ab_ = ab; + int N = Ncvec; + asm volatile("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! \n" + "subs %3, #2 \n" + "bne 1b \n" + : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); +#else // default routine, works fine for non-arm cpus with current compilers + for (i=0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2*i+0]; ai = va[2*i+1]; + br = vb[2*i+0]; bi = vb[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); + vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); + ar = va[2*i+2]; ai = va[2*i+3]; + br = vb[2*i+2]; bi = vb[2*i+3]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); + vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); + } +#endif + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling; + ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling; + } +} + + +#else // defined(PFFFT_SIMD_DISABLE) + +// standard routine using scalar floats, without SIMD stuff. + +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N; + if (setup->transform == PFFFT_COMPLEX) { + for (k=0; k < 2*N; ++k) out[k] = in[k]; + return; + } + else if (direction == PFFFT_FORWARD) { + float x_N = in[N-1]; + for (k=N-1; k > 1; --k) out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } else { + float x_N = in[1]; + for (k=1; k < N-1; ++k) out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + +#define pffft_transform_internal_nosimd pffft_transform_internal +void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, + pffft_direction_t direction, int ordered) { + int Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + float *buff[2]; + int ib; + if (scratch == 0) scratch = scratch_on_stack; + buff[0] = output; buff[1] = scratch; + + if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered. + ib = (nf_odd ^ ordered ? 1 : 0); + + if (direction == PFFFT_FORWARD) { + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + } + if (ordered) { + pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; + } + } else { + if (input == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); + input = buff[!ib]; + } + if (setup->transform == PFFFT_REAL) { + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); + } + } + if (buff[ib] != output) { + int k; + // extra copy required -- this situation should happens only when finput == foutput + assert(input==output); + for (k=0; k < Ncvec; ++k) { + float a = buff[ib][2*k], b = buff[ib][2*k+1]; + output[2*k] = a; output[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == output); +} + +#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { + int i, Ncvec = s->Ncvec; + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; + ++ab; ++a; ++b; --Ncvec; + } + for (i=0; i < Ncvec; ++i) { + float ar, ai, br, bi; + ar = a[2*i+0]; ai = a[2*i+1]; + br = b[2*i+0]; bi = b[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + ab[2*i+0] += ar*scaling; + ab[2*i+1] += ai*scaling; + } +} + +#endif // defined(PFFFT_SIMD_DISABLE) + +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); +} + +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); +} diff --git a/hvcc/generators/ir2c/static/pffft.h b/hvcc/generators/ir2c/static/pffft.h new file mode 100644 index 00000000..5db3d113 --- /dev/null +++ b/hvcc/generators/ir2c/static/pffft.h @@ -0,0 +1,177 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +/* + PFFFT : a Pretty Fast FFT. + + This is basically an adaptation of the single precision fftpack + (v4) as found on netlib taking advantage of SIMD instruction found + on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). + + For architectures where no SIMD instruction is available, the code + falls back to a scalar version. + + Restrictions: + + - 1D transforms only, with 32-bit single precision. + + - supports only transforms for inputs of length N of the form + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. + + - all (float*) pointers in the functions below are expected to + have an "simd-compatible" alignment, that is 16 bytes on x86 and + powerpc CPUs. + + You can allocate such buffers with the functions + pffft_aligned_malloc / pffft_aligned_free (or with stuff like + posix_memalign..) + +*/ + +#ifndef PFFFT_H +#define PFFFT_H + +#include // for size_t + +#ifdef __cplusplus +extern "C" { +#endif + + /* opaque struct holding internal stuff (precomputed twiddle factors) + this struct can be shared by many threads as it contains only + read-only data. + */ + typedef struct PFFFT_Setup PFFFT_Setup; + + /* direction of the transform */ + typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; + + /* type of transform */ + typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; + + /* + prepare for performing transforms of size N -- the returned + PFFFT_Setup structure is read-only so it can safely be shared by + multiple concurrent threads. + */ + PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); + void pffft_destroy_setup(PFFFT_Setup *); + /* + Perform a Fourier transform , The z-domain data is stored in the + most efficient order for transforming it back, or using it for + convolution. If you need to have its content sorted in the + "usual" way, that is as an array of interleaved complex numbers, + either use pffft_transform_ordered , or call pffft_zreorder after + the forward fft, and before the backward fft. + + Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. + Typically you will want to scale the backward transform by 1/N. + + The 'work' pointer should point to an area of N (2*N for complex + fft) floats, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). + + input and output may alias. + */ + void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + + /* + Similar to pffft_transform, but makes sure that the output is + ordered as expected (interleaved complex numbers). This is + similar to calling pffft_transform and then pffft_zreorder. + + input and output may alias. + */ + void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + + /* + call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., + PFFFT_FORWARD) if you want to have the frequency components in + the correct "canonical" order, as interleaved complex numbers. + + (for real transforms, both 0-frequency and half frequency + components, which are real, are assembled in the first entry as + F(0)+i*F(n/2+1). Note that the original fftpack did place + F(n/2+1) at the end of the arrays). + + input and output should not alias. + */ + void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and accumulate them into dft_ab. The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab += (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + + /* + the float buffers must have the correct alignment (16-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. + */ + void *pffft_aligned_malloc(size_t nb_bytes); + void pffft_aligned_free(void *); + + /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ + int pffft_simd_size(void); + +#ifdef __cplusplus +} +#endif + +#endif // PFFFT_H diff --git a/hvcc/generators/ir2c/templates/Heavy_NAME.cpp b/hvcc/generators/ir2c/templates/Heavy_NAME.cpp index c984c5d7..07c6d56f 100644 --- a/hvcc/generators/ir2c/templates/Heavy_NAME.cpp +++ b/hvcc/generators/ir2c/templates/Heavy_NAME.cpp @@ -172,6 +172,8 @@ int Heavy_{{name}}::process(float **inputBuffers, float **outputBuffers, int n) {%- if signal.numInputBuffers > 0 or signal.numOutputBuffers > 0 %} const int n4 = n & ~HV_N_SIMD_MASK; // ensure that the block size is a multiple of HV_N_SIMD + sendFloatToReceiver(0xB5B01859, static_cast(n4)); // send to __hv_blocksize + // temporary signal vars {%- if signal.numTemporaryBuffers.float > 0 %} hv_bufferf_t {% for i in range(signal.numTemporaryBuffers.float) %}Bf{{i}}{% if not loop.last %}, {%endif%}{% endfor %}; diff --git a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd new file mode 100644 index 00000000..b58b323b --- /dev/null +++ b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd @@ -0,0 +1,31 @@ +#N canvas 1920 0 450 300 12; +#X text 24 232 @hv_arg \$2 size int 0 false; +#X text 24 210 @hv_arg \$1 table string "" true; +#X obj 25 15 inlet~; +#X msg 188 105 table \$1 size; +#N canvas 0 22 450 300 @hv_obj 0; +#X obj 146 77 inlet; +#X obj 147 120 outlet; +#X restore 188 128 pd @hv_obj system; +#X text 37 46 measured after initialisation; +#X text 37 35 NOTE(mhroth): ensure that the table size is; +#X obj 25 183 outlet~; +#N canvas 0 22 450 300 @hv_obj 0; +#X obj 55 136 outlet~; +#X obj 55 75 inlet~; +#X obj 190 74 inlet; +#X obj 122 78 inlet; +#X obj 167 137 outlet~; +#X restore 25 154 pd @hv_obj __rfft~f \$0; +#X obj 188 81 symbol \$0; +#X obj 188 60 loadbang; +#X obj 65 110 r __hv_blocksize; +#X obj 175 184 outlet~; +#X connect 2 0 8 0; +#X connect 3 0 4 0; +#X connect 4 0 8 2; +#X connect 8 0 7 0; +#X connect 8 1 12 0; +#X connect 9 0 3 0; +#X connect 10 0 9 0; +#X connect 11 0 8 1; diff --git a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd new file mode 100644 index 00000000..97b39a45 --- /dev/null +++ b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd @@ -0,0 +1,31 @@ +#N canvas 1920 0 582 386 12; +#X text 23 276 @hv_arg \$2 size int 0 false; +#X text 23 254 @hv_arg \$1 table string "" true; +#X obj 24 59 inlet~; +#X msg 275 116 table \$1 size; +#N canvas 0 22 450 300 @hv_obj 0; +#X obj 146 77 inlet; +#X obj 147 120 outlet; +#X restore 275 139 pd @hv_obj system; +#X text 22 24 measured after initialisation; +#X text 22 14 NOTE(mhroth): ensure that the table size is; +#X obj 24 227 outlet~; +#X obj 275 92 symbol \$0; +#X obj 275 71 loadbang; +#X obj 129 120 r __hv_blocksize; +#N canvas 0 22 450 300 @hv_obj 0; +#X obj 55 136 outlet~; +#X obj 55 75 inlet~; +#X obj 247 75 inlet; +#X obj 179 75 inlet; +#X obj 117 75 inlet~; +#X restore 24 197 pd @hv_obj __rifft~f \$0; +#X obj 77 59 inlet~; +#X connect 2 0 11 0; +#X connect 3 0 4 0; +#X connect 4 0 11 3; +#X connect 8 0 3 0; +#X connect 9 0 8 0; +#X connect 10 0 11 2; +#X connect 11 0 7 0; +#X connect 12 0 11 1; From e3c217113b2be7cf16a9e163ef9ac545ef6d894d Mon Sep 17 00:00:00 2001 From: dreamer Date: Tue, 18 Jul 2023 11:14:41 +0200 Subject: [PATCH 02/12] add poc un-interleave code --- hvcc/generators/ir2c/static/HvSignalRFFT.c | 69 ++++++++++++++++++++-- hvcc/generators/ir2c/static/HvSignalRFFT.h | 2 +- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.c b/hvcc/generators/ir2c/static/HvSignalRFFT.c index 0e04b2c9..16658510 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.c +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.c @@ -77,8 +77,32 @@ void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut hv_assert(m >= n); const int h_orig = hTable_getHead(&o->inputs); - - pffft_transform_ordered(o->setup, &bIn, &bOut0, work, PFFFT_FORWARD); + float *const bOut = (float *)(hv_alloca(2*n*sizeof(float))); + + pffft_transform_ordered(o->setup, &bIn, bOut, work, PFFFT_FORWARD); + + // uninterleave result into the output buffers + #if HV_SIMD_SSE || HV_SIMD_AVX + for (int i = 0, j = 0; j < n; j += 4, i += 8) { + __m128 a = _mm_load_ps(bOut+i); // LRLR + __m128 b = _mm_load_ps(bOut+4+i); // LRLR + __m128 x = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0,2,0)); // LLLL + __m128 y = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1,3,1)); // RRRR + _mm_store_ps(bOut0+j, x); + _mm_store_ps(bOut1+j, y); + } + #elif HV_SIMD_NEON + for (int i = 0, j = 0; j < n; j += 4, i += 8) { + float32x4x2_t a = vld2q_f32(bOut+i); // load and uninterleave + vst1q_f32(bOut0+j, a.val[0]); + vst1q_f32(bOut1+j, a.val[1]); + } + #else // HV_SIMD_NONE + for (int j = 0; j < n; ++j) { + bOut0[n+j] = bOut[0+2*j]; + bOut1[n+j] = bOut[1+2*j]; + } + #endif __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); @@ -98,9 +122,46 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut hv_assert(m >= n); const int h_orig = hTable_getHead(&o->inputs); + float *const bIn = (float *)(hv_alloca(2*n*sizeof(float))); + + // interleave the input buffers into the transform buffer + #if HV_SIMD_AVX + for (int i = 0, j = 0; j < n; j += 8, i += 16) { + __m256 x = _mm256_load_ps(bIn0); // LLLLLLLL + __m256 y = _mm256_load_ps(bIn1); // RRRRRRRR + __m256 a = _mm256_unpacklo_ps(x, y); // LRLRLRLR + __m256 b = _mm256_unpackhi_ps(x, y); // LRLRLRLR + _mm256_store_ps(bIn+i, a); + _mm256_store_ps(bIn+8+i, b); + } + #elif HV_SIMD_SSE + for (int i = 0, j = 0; j < n4; j += 4, i += 8) { + __m128 x = _mm_load_ps(bIn0); // LLLL + __m128 y = _mm_load_ps(bIn1); // RRRR + __m128 a = _mm_unpacklo_ps(x, y); // LRLR + __m128 b = _mm_unpackhi_ps(x, y); // LRLR + _mm_store_ps(bIn+i, a); + _mm_store_ps(bIn+4+i, b); + } + #elif HV_SIMD_NEON + // https://community.arm.com/groups/processors/blog/2012/03/13/coding-for-neon--part-5-rearranging-vectors + for (int i = 0, j = 0; j < n4; j += 4, i += 8) { + float32x4_t x = vld1q_f32(bIn0); + float32x4_t y = vld1q_f32(bIn1); + float32x4x2_t z = {x, y}; + vst2q_f32(bIn+i, z); // interleave and store + } + #else // HV_SIMD_NONE + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < n; ++j) { + bIn[0+2*j] = bIn0[n+j]; + bIn[1+2*j] = bIn1[n+j]; + } + } + #endif - pffft_transform_ordered(o->setup, &bIn0, &bOut, work, PFFFT_BACKWARD); + pffft_transform_ordered(o->setup, bIn, &bOut, work, PFFFT_BACKWARD); - __hv_store_f(inputs+h_orig, bIn0); // store the new input to the inputs buffer + // __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); } diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.h b/hvcc/generators/ir2c/static/HvSignalRFFT.h index 9b2cea27..9a78347c 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.h +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.h @@ -34,7 +34,7 @@ typedef struct SignalRFFT { struct PFFFT_Setup *setup; } SignalRFFT; -hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *coeffs, const int size); +hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size); void sRFFT_free(SignalRFFT *o); From 52a28e5224f26310a749bad19dd51aa3fa0c80be Mon Sep 17 00:00:00 2001 From: dreamer Date: Tue, 18 Jul 2023 14:48:54 +0200 Subject: [PATCH 03/12] initialize tables and reduce Ir objects --- hvcc/core/hv2ir/HeavyGraph.py | 1 - hvcc/core/hv2ir/HeavyParser.py | 3 ++ hvcc/core/json/heavy.ir.json | 4 +-- hvcc/interpreters/pd2hv/libs/pd/rfft~.pd | 41 +++++++++++------------ hvcc/interpreters/pd2hv/libs/pd/rifft~.pd | 41 +++++++++++------------ 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/hvcc/core/hv2ir/HeavyGraph.py b/hvcc/core/hv2ir/HeavyGraph.py index 3093998f..8161c774 100644 --- a/hvcc/core/hv2ir/HeavyGraph.py +++ b/hvcc/core/hv2ir/HeavyGraph.py @@ -294,7 +294,6 @@ def resolve_object_for_name( from this graph. Returns None if no objects are available. This is a convenience method. """ - objs = self.resolve_objects_for_name(name, obj_types, local_graph) return objs[0] if len(objs) > 0 else None diff --git a/hvcc/core/hv2ir/HeavyParser.py b/hvcc/core/hv2ir/HeavyParser.py index 2600cad5..5aa42cc6 100644 --- a/hvcc/core/hv2ir/HeavyParser.py +++ b/hvcc/core/hv2ir/HeavyParser.py @@ -25,6 +25,7 @@ from .HIrLorenz import HIrLorenz from .HIrOutlet import HIrOutlet from .HIrPack import HIrPack +from .HIrRFFT import HIrRFFT from .HIrSwitchcase import HIrSwitchcase from .HIrTabhead import HIrTabhead from .HIrTabread import HIrTabread @@ -305,6 +306,8 @@ def reduce(self) -> tuple: "__tabwrite~f": HIrTabwrite, "__tabwrite_stoppable~f": HIrTabwrite, "__tabwrite": HIrTabwrite, + "__rfft~f": HIrRFFT, + "__rifft~f": HIrRFFT, "receive": HLangReceive, "send": HLangSend, "__switchcase": HIrSwitchcase, diff --git a/hvcc/core/json/heavy.ir.json b/hvcc/core/json/heavy.ir.json index dcb647c3..5d6d33e4 100644 --- a/hvcc/core/json/heavy.ir.json +++ b/hvcc/core/json/heavy.ir.json @@ -2239,7 +2239,7 @@ "~f>" ], "args": [{ - "name": "table_id", + "name": "table", "value_type": "string", "description": "", "default": "", @@ -2266,7 +2266,7 @@ "~f>" ], "args": [{ - "name": "table_id", + "name": "table", "value_type": "string", "description": "", "default": "", diff --git a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd index b58b323b..27b85cc7 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd @@ -1,31 +1,30 @@ -#N canvas 1920 0 450 300 12; -#X text 24 232 @hv_arg \$2 size int 0 false; -#X text 24 210 @hv_arg \$1 table string "" true; +#N canvas 796 489 436 234 12; #X obj 25 15 inlet~; -#X msg 188 105 table \$1 size; #N canvas 0 22 450 300 @hv_obj 0; #X obj 146 77 inlet; #X obj 147 120 outlet; -#X restore 188 128 pd @hv_obj system; -#X text 37 46 measured after initialisation; -#X text 37 35 NOTE(mhroth): ensure that the table size is; -#X obj 25 183 outlet~; +#X restore 211 129 pd @hv_obj system; +#X text 80 22 measured after initialisation; +#X text 80 11 NOTE(mhroth): ensure that the table size is; +#X obj 25 184 outlet~; +#X obj 211 48 loadbang; +#X obj 65 111 r __hv_blocksize; +#X obj 211 185 outlet~; +#X msg 211 104 table \$1 size; +#X obj 71 62 table rfft_\$0; +#X obj 211 76 symbol rfft_\$0; #N canvas 0 22 450 300 @hv_obj 0; #X obj 55 136 outlet~; #X obj 55 75 inlet~; #X obj 190 74 inlet; #X obj 122 78 inlet; #X obj 167 137 outlet~; -#X restore 25 154 pd @hv_obj __rfft~f \$0; -#X obj 188 81 symbol \$0; -#X obj 188 60 loadbang; -#X obj 65 110 r __hv_blocksize; -#X obj 175 184 outlet~; -#X connect 2 0 8 0; -#X connect 3 0 4 0; -#X connect 4 0 8 2; -#X connect 8 0 7 0; -#X connect 8 1 12 0; -#X connect 9 0 3 0; -#X connect 10 0 9 0; -#X connect 11 0 8 1; +#X restore 25 155 pd @hv_obj __rfft~f rfft_\$0; +#X connect 0 0 11 0; +#X connect 1 0 11 2; +#X connect 5 0 10 0; +#X connect 6 0 11 1; +#X connect 8 0 1 0; +#X connect 10 0 8 0; +#X connect 11 0 4 0; +#X connect 11 1 7 0; diff --git a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd index 97b39a45..eca3755b 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd @@ -1,31 +1,30 @@ -#N canvas 1920 0 582 386 12; -#X text 23 276 @hv_arg \$2 size int 0 false; -#X text 23 254 @hv_arg \$1 table string "" true; +#N canvas 879 333 486 278 12; #X obj 24 59 inlet~; -#X msg 275 116 table \$1 size; -#N canvas 0 22 450 300 @hv_obj 0; -#X obj 146 77 inlet; -#X obj 147 120 outlet; -#X restore 275 139 pd @hv_obj system; #X text 22 24 measured after initialisation; #X text 22 14 NOTE(mhroth): ensure that the table size is; #X obj 24 227 outlet~; -#X obj 275 92 symbol \$0; -#X obj 275 71 loadbang; -#X obj 129 120 r __hv_blocksize; +#X obj 157 121 r __hv_blocksize; +#X obj 91 59 inlet~; #N canvas 0 22 450 300 @hv_obj 0; #X obj 55 136 outlet~; #X obj 55 75 inlet~; #X obj 247 75 inlet; #X obj 179 75 inlet; #X obj 117 75 inlet~; -#X restore 24 197 pd @hv_obj __rifft~f \$0; -#X obj 77 59 inlet~; -#X connect 2 0 11 0; -#X connect 3 0 4 0; -#X connect 4 0 11 3; -#X connect 8 0 3 0; -#X connect 9 0 8 0; -#X connect 10 0 11 2; -#X connect 11 0 7 0; -#X connect 12 0 11 1; +#X restore 24 197 pd @hv_obj __rifft~f rifft_\$0; +#N canvas 0 22 450 300 @hv_obj 0; +#X obj 146 77 inlet; +#X obj 147 120 outlet; +#X restore 287 144 pd @hv_obj system; +#X obj 287 63 loadbang; +#X msg 287 119 table \$1 size; +#X obj 150 77 table rifft_\$0; +#X obj 287 91 symbol rifft_\$0; +#X connect 0 0 6 0; +#X connect 4 0 6 2; +#X connect 5 0 6 1; +#X connect 6 0 3 0; +#X connect 7 0 6 3; +#X connect 8 0 11 0; +#X connect 9 0 7 0; +#X connect 11 0 9 0; From 08f298e49b3cd6d5d1e23a36900947a823b9a348 Mon Sep 17 00:00:00 2001 From: dreamer Date: Wed, 19 Jul 2023 17:47:34 +0200 Subject: [PATCH 04/12] try to fix some things --- hvcc/generators/ir2c/static/HvSignalRFFT.c | 27 +++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.c b/hvcc/generators/ir2c/static/HvSignalRFFT.c index 16658510..a335f928 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.c +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.c @@ -27,6 +27,7 @@ hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size) { void sRFFT_free(SignalRFFT *o) { o->table = NULL; hTable_free(&o->inputs); + pffft_destroy_setup(o->setup); } void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, @@ -77,7 +78,8 @@ void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut hv_assert(m >= n); const int h_orig = hTable_getHead(&o->inputs); - float *const bOut = (float *)(hv_alloca(2*n*sizeof(float))); + // float *const bOut = (float *)(hv_alloca(2*n*sizeof(float))); + float *const bOut = (float *)(hv_alloca(sizeof(bIn))); pffft_transform_ordered(o->setup, &bIn, bOut, work, PFFFT_FORWARD); @@ -122,13 +124,16 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut hv_assert(m >= n); const int h_orig = hTable_getHead(&o->inputs); - float *const bIn = (float *)(hv_alloca(2*n*sizeof(float))); + float *bIn00 = &bIn0; + float *bIn10 = &bIn1; + // float *const bIn = (float *)(hv_alloca(2*n*sizeof(float))); + float *const bIn = (float *)(hv_alloca(sizeof(bOut))); // interleave the input buffers into the transform buffer #if HV_SIMD_AVX for (int i = 0, j = 0; j < n; j += 8, i += 16) { - __m256 x = _mm256_load_ps(bIn0); // LLLLLLLL - __m256 y = _mm256_load_ps(bIn1); // RRRRRRRR + __m256 x = _mm256_load_ps(bIn00); // LLLLLLLL + __m256 y = _mm256_load_ps(bIn10); // RRRRRRRR __m256 a = _mm256_unpacklo_ps(x, y); // LRLRLRLR __m256 b = _mm256_unpackhi_ps(x, y); // LRLRLRLR _mm256_store_ps(bIn+i, a); @@ -136,8 +141,8 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut } #elif HV_SIMD_SSE for (int i = 0, j = 0; j < n4; j += 4, i += 8) { - __m128 x = _mm_load_ps(bIn0); // LLLL - __m128 y = _mm_load_ps(bIn1); // RRRR + __m128 x = _mm_load_ps(bIn00); // LLLL + __m128 y = _mm_load_ps(bIn10); // RRRR __m128 a = _mm_unpacklo_ps(x, y); // LRLR __m128 b = _mm_unpackhi_ps(x, y); // LRLR _mm_store_ps(bIn+i, a); @@ -146,21 +151,21 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut #elif HV_SIMD_NEON // https://community.arm.com/groups/processors/blog/2012/03/13/coding-for-neon--part-5-rearranging-vectors for (int i = 0, j = 0; j < n4; j += 4, i += 8) { - float32x4_t x = vld1q_f32(bIn0); - float32x4_t y = vld1q_f32(bIn1); + float32x4_t x = vld1q_f32(bIn00); + float32x4_t y = vld1q_f32(bIn10); float32x4x2_t z = {x, y}; vst2q_f32(bIn+i, z); // interleave and store } #else // HV_SIMD_NONE for (int i = 0; i < 2; ++i) { for (int j = 0; j < n; ++j) { - bIn[0+2*j] = bIn0[n+j]; - bIn[1+2*j] = bIn1[n+j]; + bIn[0+2*j] = bIn00[n+j]; + bIn[1+2*j] = bIn10[n+j]; } } #endif - pffft_transform_ordered(o->setup, bIn, &bOut, work, PFFFT_BACKWARD); + pffft_transform_ordered(o->setup, bIn, bOut, work, PFFFT_BACKWARD); // __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); From b57b589dad323091b0ed84f51783567bee75b849 Mon Sep 17 00:00:00 2001 From: dreamer Date: Fri, 17 Nov 2023 07:27:45 +0100 Subject: [PATCH 05/12] remove simd specifics; focus on naive implementation first --- hvcc/generators/ir2c/static/HvSignalRFFT.c | 45 ---------------------- 1 file changed, 45 deletions(-) diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.c b/hvcc/generators/ir2c/static/HvSignalRFFT.c index a335f928..707e5350 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.c +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.c @@ -84,27 +84,10 @@ void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut pffft_transform_ordered(o->setup, &bIn, bOut, work, PFFFT_FORWARD); // uninterleave result into the output buffers - #if HV_SIMD_SSE || HV_SIMD_AVX - for (int i = 0, j = 0; j < n; j += 4, i += 8) { - __m128 a = _mm_load_ps(bOut+i); // LRLR - __m128 b = _mm_load_ps(bOut+4+i); // LRLR - __m128 x = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0,2,0)); // LLLL - __m128 y = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1,3,1)); // RRRR - _mm_store_ps(bOut0+j, x); - _mm_store_ps(bOut1+j, y); - } - #elif HV_SIMD_NEON - for (int i = 0, j = 0; j < n; j += 4, i += 8) { - float32x4x2_t a = vld2q_f32(bOut+i); // load and uninterleave - vst1q_f32(bOut0+j, a.val[0]); - vst1q_f32(bOut1+j, a.val[1]); - } - #else // HV_SIMD_NONE for (int j = 0; j < n; ++j) { bOut0[n+j] = bOut[0+2*j]; bOut1[n+j] = bOut[1+2*j]; } - #endif __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); @@ -130,40 +113,12 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut float *const bIn = (float *)(hv_alloca(sizeof(bOut))); // interleave the input buffers into the transform buffer - #if HV_SIMD_AVX - for (int i = 0, j = 0; j < n; j += 8, i += 16) { - __m256 x = _mm256_load_ps(bIn00); // LLLLLLLL - __m256 y = _mm256_load_ps(bIn10); // RRRRRRRR - __m256 a = _mm256_unpacklo_ps(x, y); // LRLRLRLR - __m256 b = _mm256_unpackhi_ps(x, y); // LRLRLRLR - _mm256_store_ps(bIn+i, a); - _mm256_store_ps(bIn+8+i, b); - } - #elif HV_SIMD_SSE - for (int i = 0, j = 0; j < n4; j += 4, i += 8) { - __m128 x = _mm_load_ps(bIn00); // LLLL - __m128 y = _mm_load_ps(bIn10); // RRRR - __m128 a = _mm_unpacklo_ps(x, y); // LRLR - __m128 b = _mm_unpackhi_ps(x, y); // LRLR - _mm_store_ps(bIn+i, a); - _mm_store_ps(bIn+4+i, b); - } - #elif HV_SIMD_NEON - // https://community.arm.com/groups/processors/blog/2012/03/13/coding-for-neon--part-5-rearranging-vectors - for (int i = 0, j = 0; j < n4; j += 4, i += 8) { - float32x4_t x = vld1q_f32(bIn00); - float32x4_t y = vld1q_f32(bIn10); - float32x4x2_t z = {x, y}; - vst2q_f32(bIn+i, z); // interleave and store - } - #else // HV_SIMD_NONE for (int i = 0; i < 2; ++i) { for (int j = 0; j < n; ++j) { bIn[0+2*j] = bIn00[n+j]; bIn[1+2*j] = bIn10[n+j]; } } - #endif pffft_transform_ordered(o->setup, bIn, bOut, work, PFFFT_BACKWARD); From 33bc62174ec6bcf198e8257bdc024e1f809d9772 Mon Sep 17 00:00:00 2001 From: dreamer Date: Fri, 17 Nov 2023 07:32:27 +0100 Subject: [PATCH 06/12] modify patch comments --- hvcc/interpreters/pd2hv/libs/pd/rfft~.pd | 18 +++++++++--------- hvcc/interpreters/pd2hv/libs/pd/rifft~.pd | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd index 27b85cc7..2079e285 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd @@ -5,7 +5,6 @@ #X obj 147 120 outlet; #X restore 211 129 pd @hv_obj system; #X text 80 22 measured after initialisation; -#X text 80 11 NOTE(mhroth): ensure that the table size is; #X obj 25 184 outlet~; #X obj 211 48 loadbang; #X obj 65 111 r __hv_blocksize; @@ -20,11 +19,12 @@ #X obj 122 78 inlet; #X obj 167 137 outlet~; #X restore 25 155 pd @hv_obj __rfft~f rfft_\$0; -#X connect 0 0 11 0; -#X connect 1 0 11 2; -#X connect 5 0 10 0; -#X connect 6 0 11 1; -#X connect 8 0 1 0; -#X connect 10 0 8 0; -#X connect 11 0 4 0; -#X connect 11 1 7 0; +#X text 80 11 NOTE(dreamer): ensure that the table size is; +#X connect 0 0 10 0; +#X connect 1 0 10 2; +#X connect 4 0 9 0; +#X connect 5 0 10 1; +#X connect 7 0 1 0; +#X connect 9 0 7 0; +#X connect 10 0 3 0; +#X connect 10 1 6 0; diff --git a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd index eca3755b..437506f8 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd @@ -1,4 +1,4 @@ -#N canvas 879 333 486 278 12; +#N canvas 1038 308 486 278 12; #X obj 24 59 inlet~; #X text 22 24 measured after initialisation; #X text 22 14 NOTE(mhroth): ensure that the table size is; From 52272f6e4c5566a29526bacf6c7ea8b167e7e7a9 Mon Sep 17 00:00:00 2001 From: dreamer Date: Sat, 18 Nov 2023 11:28:22 +0100 Subject: [PATCH 07/12] make sure pffft has M_PI, M_SQRT2 and disabled simd --- hvcc/generators/ir2c/static/HvSignalRFFT.h | 3 - hvcc/generators/ir2c/static/pffft.c | 164 +++++++++++---------- 2 files changed, 87 insertions(+), 80 deletions(-) diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.h b/hvcc/generators/ir2c/static/HvSignalRFFT.h index 9a78347c..dd681410 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.h +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.h @@ -24,9 +24,6 @@ extern "C" { #endif -#ifdef HV_SIMD_NONE -#define PFFFT_SIMD_DISABLE -#endif typedef struct SignalRFFT { struct HvTable *table; diff --git a/hvcc/generators/ir2c/static/pffft.c b/hvcc/generators/ir2c/static/pffft.c index 1b938e66..171daee0 100644 --- a/hvcc/generators/ir2c/static/pffft.c +++ b/hvcc/generators/ir2c/static/pffft.c @@ -25,7 +25,7 @@ Laboratory, the University Corporation for Atmospheric Research, nor the names of its sponsors or contributors may be used to endorse or promote products derived from this Software without - specific prior written permission. + specific prior written permission. - Redistributions of source code must retain the above copyright notices, this list of conditions, and the disclaimer below. @@ -53,7 +53,7 @@ */ /* - ChangeLog: + ChangeLog: - 2011/10/02, version 1: This is the very first release of this file. */ @@ -82,11 +82,21 @@ # define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) #endif +#ifndef M_PI +#define M_PI 3.14159265358979323846 /* pi */ +#endif + +#ifndef M_SQRT2 +#define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ +#endif + +#define PFFFT_SIMD_DISABLE 1 -/* + +/* vector support macros: the rest of the code is independant of SSE/Altivec/NEON -- adding support for other platforms with 4-element - vectors should be limited to these macros + vectors should be limited to these macros */ @@ -94,7 +104,7 @@ //#define PFFFT_SIMD_DISABLE /* - Altivec support macros + Altivec support macros */ #if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) #include @@ -213,7 +223,7 @@ typedef union v4sf_union { /* detect bugs with the vector support macros */ void validate_pffft_simd(void) { float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; - v4sf_union a0, a1, a2, a3, t, u; + v4sf_union a0, a1, a2, a3, t, u; memcpy(a0.f, f, 4*sizeof(float)); memcpy(a1.f, f+4, 4*sizeof(float)); memcpy(a2.f, f+8, 4*sizeof(float)); @@ -242,9 +252,9 @@ void validate_pffft_simd(void) { printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 8, 9, 6, 7); VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); - printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", - a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], - a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], + a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); } #else @@ -324,7 +334,7 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, di3 = VSUB(ci2, cr3); wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); - ch[i+l1ido] = dr2; + ch[i+l1ido] = dr2; ch[i+l1ido + 1] = di2; VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); ch[i+2*l1ido] = dr3; @@ -356,7 +366,7 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, ch[1*l1ido + 0] = VADD(tr1, tr4); ch[1*l1ido + 1] = VADD(ti1, ti4); ch[2*l1ido + 0] = VSUB(tr2, tr3); - ch[2*l1ido + 1] = VSUB(ti2, ti3); + ch[2*l1ido + 1] = VSUB(ti2, ti3); ch[3*l1ido + 0] = VSUB(tr1, tr4); ch[3*l1ido + 1] = VSUB(ti1, ti4); } @@ -405,8 +415,8 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 */ static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, - const float *wa1, const float *wa2, - const float *wa3, const float *wa4, float fsign) { + const float *wa1, const float *wa2, + const float *wa3, const float *wa4, float fsign) { static const float tr11 = .309016994374947f; const float ti11 = .951056516295154f*fsign; static const float tr12 = -.809016994374947f; @@ -485,7 +495,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 for (i=2; i0); } if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } //assert((N % 32) == 0); s->N = N; - s->transform = transform; + s->transform = transform; /* nb of complex simd vectors */ s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf)); s->e = (float*)s->data; - s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); + s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); if (transform == PFFFT_REAL) { for (k=0; k < s->Ncvec; ++k) { @@ -1270,7 +1280,7 @@ static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { v4sf g0, g1; int k; INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; - + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] for (k=1; k < N; ++k) { v4sf h0, h1; @@ -1323,12 +1333,12 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc } } else { if (direction == PFFFT_FORWARD) { - for (k=0; k < Ncvec; ++k) { + for (k=0; k < Ncvec; ++k) { int kk = (k/4) + (k%4)*(Ncvec/4); INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); } } else { - for (k=0; k < Ncvec; ++k) { + for (k=0; k < Ncvec; ++k) { int kk = (k/4) + (k%4)*(Ncvec/4); UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); } @@ -1341,7 +1351,7 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; assert(in != out); - for (k=0; k < dk; ++k) { + for (k=0; k < dk; ++k) { r0 = in[8*k+0]; i0 = in[8*k+1]; r1 = in[8*k+2]; i1 = in[8*k+3]; r2 = in[8*k+4]; i2 = in[8*k+5]; @@ -1359,7 +1369,7 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { /* transformation for each column is: - + [1 1 1 1 0 0 0 0] [r0] [1 0 -1 0 0 -1 0 1] [r1] [1 -1 1 -1 0 0 0 0] [r2] @@ -1367,14 +1377,14 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { [0 0 0 0 1 1 1 1] * [i0] [0 1 0 -1 1 0 -1 0] [i1] [0 0 0 0 1 -1 1 -1] [i2] - [0 -1 0 1 1 0 -1 0] [i3] + [0 -1 0 1 1 0 -1 0] [i3] */ - + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); - + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; } @@ -1385,7 +1395,7 @@ void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; assert(in != out); - for (k=0; k < dk; ++k) { + for (k=0; k < dk; ++k) { r0 = in[8*k+0]; i0 = in[8*k+1]; r1 = in[8*k+2]; i1 = in[8*k+3]; r2 = in[8*k+4]; i2 = in[8*k+5]; @@ -1422,7 +1432,7 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; VTRANSPOSE4(r0,r1,r2,r3); VTRANSPOSE4(i0,i1,i2,i3); - + /* transformation for each column is: @@ -1433,9 +1443,9 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * [0 0 0 0 1 1 1 1] * [i0] [0 -1 0 1 -1 0 1 0] [i1] [0 -1 0 1 1 0 -1 0] [i2] - [0 0 0 0 -1 1 -1 1] [i3] + [0 0 0 0 -1 1 -1 1] [i3] */ - + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; @@ -1446,9 +1456,9 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; - sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); + sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); - si0 = VADD(i0,i2); di0 = VSUB(i0,i2); + si0 = VADD(i0,i2); di0 = VSUB(i0,i2); si1 = VADD(i1,i3); di1 = VSUB(i3,i1); r0 = VADD(sr0, sr1); @@ -1504,7 +1514,7 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; - xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; + xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; for (k=1; k < dk; ++k) { v4sf save_next = in[8*k+7]; @@ -1515,7 +1525,7 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o } -static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, int first) { v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; /* @@ -1528,12 +1538,12 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, [0 0 0 0 1 -1 1 -1] * [i0] [0 -1 1 0 1 0 0 1] [i1] [0 0 0 0 1 1 -1 -1] [i2] - [0 1 -1 0 1 0 0 1] [i3] + [0 1 -1 0 1 0 0 1] [i3] */ - v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); + v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); - v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); + v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); r0 = VADD(sr0, sr1); @@ -1591,7 +1601,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf [ci2] [0 0 0 0 0 -2 0 2] [ci3] [0 -s 0 s 0 -s 0 -s] */ - for (k=1; k < dk; ++k) { + for (k=1; k < dk; ++k) { pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); } @@ -1625,44 +1635,44 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo //assert(finput != foutput); if (direction == PFFFT_FORWARD) { ib = !ib; - if (setup->transform == PFFFT_REAL) { + if (setup->transform == PFFFT_REAL) { ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); } else { v4sf *tmp = buff[ib]; for (k=0; k < Ncvec; ++k) { UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); } - ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); } if (ordered) { - pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); + pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); } else ib = !ib; } else { - if (vinput == buff[ib]) { + if (vinput == buff[ib]) { ib = !ib; // may happen when finput == foutput } if (ordered) { - pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); + pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); vinput = buff[ib]; ib = !ib; } if (setup->transform == PFFFT_REAL) { pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); - ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); } else { pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); - ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); for (k=0; k < Ncvec; ++k) { INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); } } } - + if (buff[ib] != voutput) { /* extra copy required -- this situation should only happen when finput == foutput */ assert(finput==foutput); @@ -1712,7 +1722,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, bi = ((v4sf_union*)vb)[1].f[0]; abr = ((v4sf_union*)vab)[0].f[0]; abi = ((v4sf_union*)vab)[1].f[0]; - + #ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc const float *a_ = a, *b_ = b; float *ab_ = ab; int N = Ncvec; @@ -1730,11 +1740,11 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, "vld1.f32 {q2,q3}, [%0,:128]! \n" "vld1.f32 {q6,q7}, [%1,:128]! \n" "vld1.f32 {q8,q9}, [r8,:128]! \n" - + "vmul.f32 q10, q0, q4 \n" "vmul.f32 q11, q0, q5 \n" - "vmul.f32 q12, q2, q6 \n" - "vmul.f32 q13, q2, q7 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" "vmls.f32 q10, q1, q5 \n" "vmla.f32 q11, q1, q4 \n" "vld1.f32 {q0,q1}, [r8,:128]! \n" @@ -1784,12 +1794,12 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pfff } else if (direction == PFFFT_FORWARD) { float x_N = in[N-1]; - for (k=N-1; k > 1; --k) out[k] = in[k-1]; + for (k=N-1; k > 1; --k) out[k] = in[k-1]; out[0] = in[0]; out[1] = x_N; } else { float x_N = in[1]; - for (k=1; k < N-1; ++k) out[k] = in[k+1]; + for (k=1; k < N-1; ++k) out[k] = in[k+1]; out[0] = in[0]; out[N-1] = x_N; } @@ -1813,29 +1823,29 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo ib = (nf_odd ^ ordered ? 1 : 0); if (direction == PFFFT_FORWARD) { - if (setup->transform == PFFFT_REAL) { + if (setup->transform == PFFFT_REAL) { ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); } else { - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); } if (ordered) { pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; } - } else { - if (input == buff[ib]) { + } else { + if (input == buff[ib]) { ib = !ib; // may happen when finput == foutput } if (ordered) { - pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); + pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); input = buff[!ib]; } if (setup->transform == PFFFT_REAL) { - ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); } else { - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); } } From e59499e4b07505db5a27476c3f605f325ede102d Mon Sep 17 00:00:00 2001 From: dromer Date: Wed, 29 Jan 2025 15:29:32 +0100 Subject: [PATCH 08/12] update signatures to latest dev --- hvcc/generators/ir2c/SignalRFFT.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/hvcc/generators/ir2c/SignalRFFT.py b/hvcc/generators/ir2c/SignalRFFT.py index 1d1514be..f5901d34 100644 --- a/hvcc/generators/ir2c/SignalRFFT.py +++ b/hvcc/generators/ir2c/SignalRFFT.py @@ -18,6 +18,8 @@ from .HeavyObject import HeavyObject +from hvcc.types.IR import IRSignalList + class SignalRFFT(HeavyObject): @@ -33,7 +35,7 @@ def get_C_file_set(cls) -> set: return {"HvSignalRFFT.h", "HvSignalRFFT.c", "pffft.h", "pffft.c"} @classmethod - def get_C_init(cls, obj_type: str, obj_id: int, args: Dict) -> List[str]: + def get_C_init(cls, obj_type: str, obj_id: str, args: Dict) -> List[str]: return [ "sRFFT_init(&sRFFT_{0}, &hTable_{1}, {2});".format( obj_id, @@ -42,7 +44,7 @@ def get_C_init(cls, obj_type: str, obj_id: int, args: Dict) -> List[str]: ] @classmethod - def get_C_onMessage(cls, obj_type: str, obj_id: int, inlet_index: int, args: Dict) -> List[str]: + def get_C_onMessage(cls, obj_type: str, obj_id: str, inlet_index: int, args: Dict) -> List[str]: return [ "sRFFT_onMessage(_c, &Context(_c)->sRFFT_{0}, {1}, m, NULL);".format( obj_id, @@ -50,23 +52,23 @@ def get_C_onMessage(cls, obj_type: str, obj_id: int, inlet_index: int, args: Dic ] @classmethod - def get_C_process(cls, process_dict: Dict, obj_type: str, obj_id: int, args: Dict) -> List[str]: + def get_C_process(cls, process_dict: IRSignalList, obj_type: str, obj_id: str, args: Dict) -> List[str]: if obj_type == "__rfft~f": return [ "__hv_rfft_f(&sRFFT_{0}, VIf({1}), VOf({2}), VOf({3}));".format( - process_dict["id"], - cls._c_buffer(process_dict["inputBuffers"][0]), - cls._c_buffer(process_dict["outputBuffers"][0]), - cls._c_buffer(process_dict["outputBuffers"][1]) + process_dict.id, + cls._c_buffer(process_dict.inputBuffers[0]), + cls._c_buffer(process_dict.outputBuffers[0]), + cls._c_buffer(process_dict.outputBuffers[1]) ) ] elif obj_type == "__rifft~f": return [ "__hv_rifft_f(&sRFFT_{0}, VIf({1}), VIf({2}), VOf({3}));".format( - process_dict["id"], - cls._c_buffer(process_dict["inputBuffers"][0]), - cls._c_buffer(process_dict["inputBuffers"][1]), - cls._c_buffer(process_dict["outputBuffers"][0]) + process_dict.id, + cls._c_buffer(process_dict.inputBuffers[0]), + cls._c_buffer(process_dict.inputBuffers[1]), + cls._c_buffer(process_dict.outputBuffers[0]) ) ] else: From c0e218b09d0312586393c5d66ac57f96d26ed531 Mon Sep 17 00:00:00 2001 From: dreamer Date: Sat, 27 Dec 2025 03:12:34 +0100 Subject: [PATCH 09/12] get block_size from graph; restart DSP --- hvcc/core/hv2ir/HIrRFFT.py | 1 + hvcc/core/hv2ir/HeavyParser.py | 2 + hvcc/generators/ir2c/SignalRFFT.py | 4 +- hvcc/generators/ir2c/static/HvSignalRFFT.c | 7 +- hvcc/generators/ir2c/static/HvSignalRFFT.h | 2 - hvcc/generators/ir2c/static/pffft.c | 1894 ----------------- hvcc/generators/ir2c/static/pffft.h | 177 -- hvcc/generators/ir2c/templates/Heavy_NAME.cpp | 2 - hvcc/interpreters/pd2hv/PdGraph.py | 4 + hvcc/interpreters/pd2hv/PdParser.py | 4 + hvcc/interpreters/pd2hv/libs/pd/rfft~.pd | 21 +- hvcc/interpreters/pd2hv/libs/pd/rifft~.pd | 27 +- 12 files changed, 36 insertions(+), 2109 deletions(-) delete mode 100644 hvcc/generators/ir2c/static/pffft.c delete mode 100644 hvcc/generators/ir2c/static/pffft.h diff --git a/hvcc/core/hv2ir/HIrRFFT.py b/hvcc/core/hv2ir/HIrRFFT.py index be8b961e..8bfdd716 100644 --- a/hvcc/core/hv2ir/HIrRFFT.py +++ b/hvcc/core/hv2ir/HIrRFFT.py @@ -36,6 +36,7 @@ def __init__( def reduce(self) -> Optional[tuple]: if self.graph is not None: + self.args["block_size"] = self.graph.args["block_size"] table_obj = self.graph.resolve_object_for_name( self.args["table"], ["table", "__table"]) diff --git a/hvcc/core/hv2ir/HeavyParser.py b/hvcc/core/hv2ir/HeavyParser.py index 4f50834a..14901afd 100644 --- a/hvcc/core/hv2ir/HeavyParser.py +++ b/hvcc/core/hv2ir/HeavyParser.py @@ -115,6 +115,8 @@ def graph_from_object( """ # resolve default graph arguments graph_args = graph_args or {} + if(json_heavy["block_size"] is not None): + graph_args["block_size"] = int(json_heavy["block_size"]) for a in json_heavy["args"]: if a["name"] not in graph_args: if a["required"]: diff --git a/hvcc/generators/ir2c/SignalRFFT.py b/hvcc/generators/ir2c/SignalRFFT.py index f5901d34..49f8a100 100644 --- a/hvcc/generators/ir2c/SignalRFFT.py +++ b/hvcc/generators/ir2c/SignalRFFT.py @@ -32,7 +32,7 @@ def get_C_header_set(cls) -> set: @classmethod def get_C_file_set(cls) -> set: - return {"HvSignalRFFT.h", "HvSignalRFFT.c", "pffft.h", "pffft.c"} + return {"HvSignalRFFT.h", "HvSignalRFFT.c"} @classmethod def get_C_init(cls, obj_type: str, obj_id: str, args: Dict) -> List[str]: @@ -40,7 +40,7 @@ def get_C_init(cls, obj_type: str, obj_id: str, args: Dict) -> List[str]: "sRFFT_init(&sRFFT_{0}, &hTable_{1}, {2});".format( obj_id, args["table_id"], - 64) + args["block_size"]) ] @classmethod diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.c b/hvcc/generators/ir2c/static/HvSignalRFFT.c index 707e5350..178442d2 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.c +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.c @@ -15,11 +15,9 @@ */ #include "HvSignalRFFT.h" -#include "pffft.h" hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size) { o->table = table; - o->setup = pffft_new_setup(size, PFFFT_REAL); hv_size_t numBytes = hTable_init(&o->inputs, size); return numBytes; } @@ -27,7 +25,6 @@ hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size) { void sRFFT_free(SignalRFFT *o) { o->table = NULL; hTable_free(&o->inputs); - pffft_destroy_setup(o->setup); } void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, @@ -81,7 +78,7 @@ void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut // float *const bOut = (float *)(hv_alloca(2*n*sizeof(float))); float *const bOut = (float *)(hv_alloca(sizeof(bIn))); - pffft_transform_ordered(o->setup, &bIn, bOut, work, PFFFT_FORWARD); + // do fft stuff // uninterleave result into the output buffers for (int j = 0; j < n; ++j) { @@ -120,7 +117,7 @@ void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut } } - pffft_transform_ordered(o->setup, bIn, bOut, work, PFFFT_BACKWARD); + // do ifft stuff // __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.h b/hvcc/generators/ir2c/static/HvSignalRFFT.h index dd681410..f47e5c76 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.h +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.h @@ -18,7 +18,6 @@ #define _SIGNAL_RFFT_H_ #include "HvHeavyInternal.h" -#include "pffft.h" #ifdef __cplusplus extern "C" { @@ -28,7 +27,6 @@ extern "C" { typedef struct SignalRFFT { struct HvTable *table; struct HvTable inputs; - struct PFFFT_Setup *setup; } SignalRFFT; hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size); diff --git a/hvcc/generators/ir2c/static/pffft.c b/hvcc/generators/ir2c/static/pffft.c deleted file mode 100644 index 171daee0..00000000 --- a/hvcc/generators/ir2c/static/pffft.c +++ /dev/null @@ -1,1894 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB - (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber - of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - - - PFFFT : a Pretty Fast FFT. - - This file is largerly based on the original FFTPACK implementation, modified in - order to take advantage of SIMD instructions of modern CPUs. -*/ - -/* - ChangeLog: - - 2011/10/02, version 1: This is the very first release of this file. -*/ - -#include "pffft.h" -#include -#include -#include -#include - -/* detect compiler flavour */ -#if defined(_MSC_VER) -# define COMPILER_MSVC -#elif defined(__GNUC__) -# define COMPILER_GCC -#endif - -#if defined(COMPILER_GCC) -# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) -# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) -# define RESTRICT __restrict -# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; -#elif defined(COMPILER_MSVC) -# define ALWAYS_INLINE(return_type) __forceinline return_type -# define NEVER_INLINE(return_type) __declspec(noinline) return_type -# define RESTRICT __restrict -# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) -#endif - -#ifndef M_PI -#define M_PI 3.14159265358979323846 /* pi */ -#endif - -#ifndef M_SQRT2 -#define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ -#endif - -#define PFFFT_SIMD_DISABLE 1 - - -/* - vector support macros: the rest of the code is independant of - SSE/Altivec/NEON -- adding support for other platforms with 4-element - vectors should be limited to these macros -*/ - - -// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code -//#define PFFFT_SIMD_DISABLE - -/* - Altivec support macros -*/ -#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) -#include -typedef vector float v4sf; -# define SIMD_SZ 4 -# define VZERO() ((vector float) vec_splat_u8(0)) -# define VMUL(a,b) vec_madd(a,b, VZERO()) -# define VADD(a,b) vec_add(a,b) -# define VMADD(a,b,c) vec_madd(a,b,c) -# define VSUB(a,b) vec_sub(a,b) -inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } -# define LD_PS1(p) ld_ps1(&p) -# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } -# define UNINTERLEAVE2(in1, in2, out1, out2) { \ - vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \ - vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \ - v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ - } -# define VTRANSPOSE4(x0,x1,x2,x3) { \ - v4sf y0 = vec_mergeh(x0, x2); \ - v4sf y1 = vec_mergel(x0, x2); \ - v4sf y2 = vec_mergeh(x1, x3); \ - v4sf y3 = vec_mergel(x1, x3); \ - x0 = vec_mergeh(y0, y2); \ - x1 = vec_mergel(y0, y2); \ - x2 = vec_mergeh(y1, y3); \ - x3 = vec_mergel(y1, y3); \ - } -# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}) -# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) - -/* - SSE1 support macros -*/ -#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86)) - -#include -typedef __m128 v4sf; -# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. -# define VZERO() _mm_setzero_ps() -# define VMUL(a,b) _mm_mul_ps(a,b) -# define VADD(a,b) _mm_add_ps(a,b) -# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) -# define VSUB(a,b) _mm_sub_ps(a,b) -# define LD_PS1(p) _mm_set1_ps(p) -# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } -# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } -# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) -# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) -# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) - -/* - ARM NEON support macros -*/ -#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__)) -# include -typedef float32x4_t v4sf; -# define SIMD_SZ 4 -# define VZERO() vdupq_n_f32(0) -# define VMUL(a,b) vmulq_f32(a,b) -# define VADD(a,b) vaddq_f32(a,b) -# define VMADD(a,b,c) vmlaq_f32(c,a,b) -# define VSUB(a,b) vsubq_f32(a,b) -# define LD_PS1(p) vld1q_dup_f32(&(p)) -# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define VTRANSPOSE4(x0,x1,x2,x3) { \ - float32x4x2_t t0_ = vzipq_f32(x0, x2); \ - float32x4x2_t t1_ = vzipq_f32(x1, x3); \ - float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ - float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ - x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ - } -// marginally faster version -//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } -# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) -# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) -#else -# if !defined(PFFFT_SIMD_DISABLE) -# warning "building with simd disabled !\n"; -# define PFFFT_SIMD_DISABLE // fallback to scalar code -# endif -#endif - -// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead -#ifdef PFFFT_SIMD_DISABLE -typedef float v4sf; -# define SIMD_SZ 1 -# define VZERO() 0.f -# define VMUL(a,b) ((a)*(b)) -# define VADD(a,b) ((a)+(b)) -# define VMADD(a,b,c) ((a)*(b)+(c)) -# define VSUB(a,b) ((a)-(b)) -# define LD_PS1(p) (p) -# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) -#endif - -// shortcuts for complex multiplcations -#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } -#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } -#ifndef SVMUL -// multiply a scalar with a vector -#define SVMUL(f,v) VMUL(LD_PS1(f),v) -#endif - -#if !defined(PFFFT_SIMD_DISABLE) -typedef union v4sf_union { - v4sf v; - float f[4]; -} v4sf_union; - -#include - -#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) - -/* detect bugs with the vector support macros */ -void validate_pffft_simd(void) { - float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; - v4sf_union a0, a1, a2, a3, t, u; - memcpy(a0.f, f, 4*sizeof(float)); - memcpy(a1.f, f+4, 4*sizeof(float)); - memcpy(a2.f, f+8, 4*sizeof(float)); - memcpy(a3.f, f+12, 4*sizeof(float)); - - t = a0; u = a1; t.v = VZERO(); - printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); - t.v = VADD(a1.v, a2.v); - printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); - t.v = VMUL(a1.v, a2.v); - printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); - t.v = VMADD(a1.v, a2.v,a0.v); - printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); - - INTERLEAVE2(a1.v,a2.v,t.v,u.v); - printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); - assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); - UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); - printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); - assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); - - t.v=LD_PS1(f[15]); - printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); - assertv4(t, 15, 15, 15, 15); - t.v = VSWAPHL(a1.v, a2.v); - printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); - assertv4(t, 8, 9, 6, 7); - VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); - printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", - a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], - a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); - assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); -} -#else -void validate_pffft_simd() {} // allow test_pffft.c to call this function even when simd is not available.. -#endif //!PFFFT_SIMD_DISABLE - -/* SSE and co like 16-bytes aligned pointers */ -#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... -void *pffft_aligned_malloc(size_t nb_bytes) { - void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); - if (!p0) return (void *) 0; - p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); - *((void **) p - 1) = p0; - return p; -} - -void pffft_aligned_free(void *p) { - if (p) free(*((void **) p - 1)); -} - -int pffft_simd_size() { return SIMD_SZ; } - -/* - passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 -*/ -static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { - int k, i; - int l1ido = l1*ido; - if (ido <= 2) { - for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { - ch[0] = VADD(cc[0], cc[ido+0]); - ch[l1ido] = VSUB(cc[0], cc[ido+0]); - ch[1] = VADD(cc[1], cc[ido+1]); - ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); - } - } else { - for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { - for (i=0; i 2); - for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { - for (i=0; i 2); - for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { - for (i = 0; i < ido-1; i += 2) { - ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); - ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); - ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); - ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); - tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); - tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); - tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); - tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); - ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); - ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); - cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); - ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); - cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); - ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); - cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); - ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); - ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); - dr3 = VSUB(cr3, ci4); - dr4 = VADD(cr3, ci4); - di3 = VADD(ci3, cr4); - di4 = VSUB(ci3, cr4); - dr5 = VADD(cr2, ci5); - dr2 = VSUB(cr2, ci5); - di5 = VSUB(ci2, cr5); - di2 = VADD(ci2, cr5); - wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; - wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1]; - VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); - ch_ref(i - 1, 2) = dr2; - ch_ref(i, 2) = di2; - VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); - ch_ref(i - 1, 3) = dr3; - ch_ref(i, 3) = di3; - VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); - ch_ref(i - 1, 4) = dr4; - ch_ref(i, 4) = di4; - VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); - ch_ref(i - 1, 5) = dr5; - ch_ref(i, 5) = di5; - } - } -#undef ch_ref -#undef cc_ref -} - -static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { - static const float minus_one = -1.f; - int i, k, l1ido = l1*ido; - for (k=0; k < l1ido; k += ido) { - v4sf a = cc[k], b = cc[k + l1ido]; - ch[2*k] = VADD(a, b); - ch[2*(k+ido)-1] = VSUB(a, b); - } - if (ido < 2) return; - if (ido != 2) { - for (k=0; k < l1ido; k += ido) { - for (i=2; i 5) { - wa[i1-1] = wa[i-1]; - wa[i1] = wa[i]; - } - } - l1 = l2; - } -} /* cffti1 */ - - -v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { - v4sf *in = (v4sf*)input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - int nf = ifac[1], k1; - int l1 = 1; - int iw = 0; - assert(in != out && work1 != work2); - for (k1=2; k1<=nf+1; k1++) { - int ip = ifac[k1]; - int l2 = ip*l1; - int ido = n / l2; - int idot = ido + ido; - switch (ip) { - case 5: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; - int ix4 = ix3 + idot; - passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); - } break; - case 4: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; - passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign); - } break; - case 2: { - passf2_ps(idot, l1, in, out, &wa[iw], isign); - } break; - case 3: { - int ix2 = iw + idot; - passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign); - } break; - default: - assert(0); - } - l1 = l2; - iw += (ip - 1)*idot; - if (out == work2) { - out = work1; in = work2; - } else { - out = work2; in = work1; - } - } - - return in; /* this is in fact the output .. */ -} - - -struct PFFFT_Setup { - int N; - int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) - int ifac[15]; - pffft_transform_t transform; - v4sf *data; // allocated room for twiddle coefs - float *e; // points into 'data' , N/4*3 elements - float *twiddle; // points into 'data', N/4 elements -}; - -PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { - PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup)); - int k, m; - /* unfortunately, the fft size must be a multiple of 16 for complex FFTs - and 32 for real FFTs -- a lot of stuff would need to be rewritten to - handle other cases (or maybe just switch to a scalar fft, I don't know..) */ - if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } - if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } - //assert((N % 32) == 0); - s->N = N; - s->transform = transform; - /* nb of complex simd vectors */ - s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; - s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf)); - s->e = (float*)s->data; - s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); - - if (transform == PFFFT_REAL) { - for (k=0; k < s->Ncvec; ++k) { - int i = k/SIMD_SZ; - int j = k%SIMD_SZ; - for (m=0; m < SIMD_SZ-1; ++m) { - float A = -2*M_PI*(m+1)*k / N; - s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A); - s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A); - } - } - rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); - } else { - for (k=0; k < s->Ncvec; ++k) { - int i = k/SIMD_SZ; - int j = k%SIMD_SZ; - for (m=0; m < SIMD_SZ-1; ++m) { - float A = -2*M_PI*(m+1)*k / N; - s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A); - s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A); - } - } - cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); - } - - /* check that N is decomposable with allowed prime factors */ - for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } - if (m != N/SIMD_SZ) { - pffft_destroy_setup(s); s = 0; - } - - return s; -} - - -void pffft_destroy_setup(PFFFT_Setup *s) { - pffft_aligned_free(s->data); - free(s); -} - -#if !defined(PFFFT_SIMD_DISABLE) - -/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { - v4sf g0, g1; - int k; - INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; - - *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] - for (k=1; k < N; ++k) { - v4sf h0, h1; - INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; - *--out = VSWAPHL(g1, h0); - *--out = VSWAPHL(h0, h1); - g1 = h1; - } - *--out = VSWAPHL(g1, g0); -} - -static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { - v4sf g0, g1, h0, h1; - int k; - g0 = g1 = in[0]; ++in; - for (k=1; k < N; ++k) { - h0 = *in++; h1 = *in++; - g1 = VSWAPHL(g1, h0); - h0 = VSWAPHL(h0, h1); - UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; - g1 = h1; - } - h0 = *in++; h1 = g0; - g1 = VSWAPHL(g1, h0); - h0 = VSWAPHL(h0, h1); - UNINTERLEAVE2(h0, g1, out[0], out[1]); -} - -void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { - int k, N = setup->N, Ncvec = setup->Ncvec; - const v4sf *vin = (const v4sf*)in; - v4sf *vout = (v4sf*)out; - assert(in != out); - if (setup->transform == PFFFT_REAL) { - int dk = N/32; - if (direction == PFFFT_FORWARD) { - for (k=0; k < dk; ++k) { - INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); - INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); - } - reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); - reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); - } else { - for (k=0; k < dk; ++k) { - UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); - UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); - } - unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); - unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8); - } - } else { - if (direction == PFFFT_FORWARD) { - for (k=0; k < Ncvec; ++k) { - int kk = (k/4) + (k%4)*(Ncvec/4); - INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); - } - } else { - for (k=0; k < Ncvec; ++k) { - int kk = (k/4) + (k%4)*(Ncvec/4); - UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); - } - } - } -} - -void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - assert(in != out); - for (k=0; k < dk; ++k) { - r0 = in[8*k+0]; i0 = in[8*k+1]; - r1 = in[8*k+2]; i1 = in[8*k+3]; - r2 = in[8*k+4]; i2 = in[8*k+5]; - r3 = in[8*k+6]; i3 = in[8*k+7]; - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); - VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); - VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); - - sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); - si0 = VADD(i0,i2); di0 = VSUB(i0, i2); - si1 = VADD(i1,i3); di1 = VSUB(i1, i3); - - /* - transformation for each column is: - - [1 1 1 1 0 0 0 0] [r0] - [1 0 -1 0 0 -1 0 1] [r1] - [1 -1 1 -1 0 0 0 0] [r2] - [1 0 -1 0 0 1 0 -1] [r3] - [0 0 0 0 1 1 1 1] * [i0] - [0 1 0 -1 1 0 -1 0] [i1] - [0 0 0 0 1 -1 1 -1] [i2] - [0 -1 0 1 1 0 -1 0] [i3] - */ - - r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); - r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); - r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); - r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); - - *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; - *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; - } -} - -void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - assert(in != out); - for (k=0; k < dk; ++k) { - r0 = in[8*k+0]; i0 = in[8*k+1]; - r1 = in[8*k+2]; i1 = in[8*k+3]; - r2 = in[8*k+4]; i2 = in[8*k+5]; - r3 = in[8*k+6]; i3 = in[8*k+7]; - - sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); - si0 = VADD(i0,i2); di0 = VSUB(i0, i2); - si1 = VADD(i1,i3); di1 = VSUB(i1, i3); - - r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); - r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); - r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); - r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); - - VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); - VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); - VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); - - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - - *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; - *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; - } -} - - -static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, - const v4sf *e, v4sf *out) { - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - r0 = *in0; i0 = *in1; - r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - - /* - transformation for each column is: - - [1 1 1 1 0 0 0 0] [r0] - [1 0 -1 0 0 -1 0 1] [r1] - [1 0 -1 0 0 1 0 -1] [r2] - [1 -1 1 -1 0 0 0 0] [r3] - [0 0 0 0 1 1 1 1] * [i0] - [0 -1 0 1 -1 0 1 0] [i1] - [0 -1 0 1 1 0 -1 0] [i2] - [0 0 0 0 -1 1 -1 1] [i3] - */ - - //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; - //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; - - VCPLXMUL(r1,i1,e[0],e[1]); - VCPLXMUL(r2,i2,e[2],e[3]); - VCPLXMUL(r3,i3,e[4],e[5]); - - //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; - //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; - - sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); - si0 = VADD(i0,i2); di0 = VSUB(i0,i2); - si1 = VADD(i1,i3); di1 = VSUB(i3,i1); - - r0 = VADD(sr0, sr1); - r3 = VSUB(sr0, sr1); - i0 = VADD(si0, si1); - i3 = VSUB(si1, si0); - r1 = VADD(dr0, di1); - r2 = VSUB(dr0, di1); - i1 = VSUB(dr1, di0); - i2 = VADD(dr1, di0); - - *out++ = r0; - *out++ = i0; - *out++ = r1; - *out++ = i1; - *out++ = r2; - *out++ = i2; - *out++ = r3; - *out++ = i3; - -} - -static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ - - v4sf_union cr, ci, *uout = (v4sf_union*)out; - v4sf save = in[7], zero=VZERO(); - float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; - static const float s = M_SQRT2/2; - - cr.v = in[0]; ci.v = in[Ncvec*2-1]; - assert(in != out); - pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); - - /* - [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] - - [Xr(1)] ] [1 1 1 1 0 0 0 0] - [Xr(N/4) ] [0 0 0 0 1 s 0 -s] - [Xr(N/2) ] [1 0 -1 0 0 0 0 0] - [Xr(3N/4)] [0 0 0 0 1 -s 0 s] - [Xi(1) ] [1 -1 1 -1 0 0 0 0] - [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] - [Xi(N/2) ] [0 -1 0 1 0 0 0 0] - [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] - */ - - xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; - xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; - xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; - xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; - xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; - xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; - xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; - xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; - - for (k=1; k < dk; ++k) { - v4sf save_next = in[8*k+7]; - pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, - e + k*6, out + k*8); - save = save_next; - } - -} - -static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, - const v4sf *e, v4sf *out, int first) { - v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; - /* - transformation for each column is: - - [1 1 1 1 0 0 0 0] [r0] - [1 0 0 -1 0 -1 -1 0] [r1] - [1 -1 -1 1 0 0 0 0] [r2] - [1 0 0 -1 0 1 1 0] [r3] - [0 0 0 0 1 -1 1 -1] * [i0] - [0 -1 1 0 1 0 0 1] [i1] - [0 0 0 0 1 1 -1 -1] [i2] - [0 1 -1 0 1 0 0 1] [i3] - */ - - v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); - v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); - v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); - v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); - - r0 = VADD(sr0, sr1); - r2 = VSUB(sr0, sr1); - r1 = VSUB(dr0, si1); - r3 = VADD(dr0, si1); - i0 = VSUB(di0, di1); - i2 = VADD(di0, di1); - i1 = VSUB(si0, dr1); - i3 = VADD(si0, dr1); - - VCPLXMULCONJ(r1,i1,e[0],e[1]); - VCPLXMULCONJ(r2,i2,e[2],e[3]); - VCPLXMULCONJ(r3,i3,e[4],e[5]); - - VTRANSPOSE4(r0,r1,r2,r3); - VTRANSPOSE4(i0,i1,i2,i3); - - if (!first) { - *out++ = r0; - *out++ = i0; - } - *out++ = r1; - *out++ = i1; - *out++ = r2; - *out++ = i2; - *out++ = r3; - *out++ = i3; -} - -static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { - int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks - /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ - - v4sf_union Xr, Xi, *uout = (v4sf_union*)out; - float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; - static const float s = M_SQRT2; - assert(in != out); - for (k=0; k < 4; ++k) { - Xr.f[k] = ((float*)in)[8*k]; - Xi.f[k] = ((float*)in)[8*k+4]; - } - - pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values - - /* - [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] - - [cr0] [1 0 2 0 1 0 0 0] - [cr1] [1 0 0 0 -1 0 -2 0] - [cr2] [1 0 -2 0 1 0 0 0] - [cr3] [1 0 0 0 -1 0 2 0] - [ci0] [0 2 0 2 0 0 0 0] - [ci1] [0 s 0 -s 0 -s 0 -s] - [ci2] [0 0 0 0 0 -2 0 2] - [ci3] [0 -s 0 s 0 -s 0 -s] - */ - for (k=1; k < dk; ++k) { - pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); - } - - cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; - cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; - cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; - cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; - ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; - ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; - ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; - ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; -} - - -void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, - pffft_direction_t direction, int ordered) { - int k, Ncvec = setup->Ncvec; - int nf_odd = (setup->ifac[1] & 1); - - // temporary buffer is allocated on the stack if the scratch pointer is NULL - int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); - VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); - - const v4sf *vinput = (const v4sf*)finput; - v4sf *voutput = (v4sf*)foutput; - v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; - int ib = (nf_odd ^ ordered ? 1 : 0); - - assert(VALIGNED(finput) && VALIGNED(foutput)); - - //assert(finput != foutput); - if (direction == PFFFT_FORWARD) { - ib = !ib; - if (setup->transform == PFFFT_REAL) { - ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); - } else { - v4sf *tmp = buff[ib]; - for (k=0; k < Ncvec; ++k) { - UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); - } - ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], - setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); - pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); - } - if (ordered) { - pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); - } else ib = !ib; - } else { - if (vinput == buff[ib]) { - ib = !ib; // may happen when finput == foutput - } - if (ordered) { - pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); - vinput = buff[ib]; ib = !ib; - } - if (setup->transform == PFFFT_REAL) { - pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); - ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - } else { - pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); - ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], - setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); - for (k=0; k < Ncvec; ++k) { - INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); - } - } - } - - if (buff[ib] != voutput) { - /* extra copy required -- this situation should only happen when finput == foutput */ - assert(finput==foutput); - for (k=0; k < Ncvec; ++k) { - v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; - voutput[2*k] = a; voutput[2*k+1] = b; - } - ib = !ib; - } - assert(buff[ib] == voutput); -} - -void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - int Ncvec = s->Ncvec; - const v4sf * RESTRICT va = (const v4sf*)a; - const v4sf * RESTRICT vb = (const v4sf*)b; - v4sf * RESTRICT vab = (v4sf*)ab; - -#ifdef __arm__ - __builtin_prefetch(va); - __builtin_prefetch(vb); - __builtin_prefetch(vab); - __builtin_prefetch(va+2); - __builtin_prefetch(vb+2); - __builtin_prefetch(vab+2); - __builtin_prefetch(va+4); - __builtin_prefetch(vb+4); - __builtin_prefetch(vab+4); - __builtin_prefetch(va+6); - __builtin_prefetch(vb+6); - __builtin_prefetch(vab+6); -# ifndef __clang__ -# define ZCONVOLVE_USING_INLINE_NEON_ASM -# endif -#endif - - float ar, ai, br, bi, abr, abi; -#ifndef ZCONVOLVE_USING_INLINE_ASM - v4sf vscal = LD_PS1(scaling); - int i; -#endif - - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - ar = ((v4sf_union*)va)[0].f[0]; - ai = ((v4sf_union*)va)[1].f[0]; - br = ((v4sf_union*)vb)[0].f[0]; - bi = ((v4sf_union*)vb)[1].f[0]; - abr = ((v4sf_union*)vab)[0].f[0]; - abi = ((v4sf_union*)vab)[1].f[0]; - -#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc - const float *a_ = a, *b_ = b; float *ab_ = ab; - int N = Ncvec; - asm volatile("mov r8, %2 \n" - "vdup.f32 q15, %4 \n" - "1: \n" - "pld [%0,#64] \n" - "pld [%1,#64] \n" - "pld [%2,#64] \n" - "pld [%0,#96] \n" - "pld [%1,#96] \n" - "pld [%2,#96] \n" - "vld1.f32 {q0,q1}, [%0,:128]! \n" - "vld1.f32 {q4,q5}, [%1,:128]! \n" - "vld1.f32 {q2,q3}, [%0,:128]! \n" - "vld1.f32 {q6,q7}, [%1,:128]! \n" - "vld1.f32 {q8,q9}, [r8,:128]! \n" - - "vmul.f32 q10, q0, q4 \n" - "vmul.f32 q11, q0, q5 \n" - "vmul.f32 q12, q2, q6 \n" - "vmul.f32 q13, q2, q7 \n" - "vmls.f32 q10, q1, q5 \n" - "vmla.f32 q11, q1, q4 \n" - "vld1.f32 {q0,q1}, [r8,:128]! \n" - "vmls.f32 q12, q3, q7 \n" - "vmla.f32 q13, q3, q6 \n" - "vmla.f32 q8, q10, q15 \n" - "vmla.f32 q9, q11, q15 \n" - "vmla.f32 q0, q12, q15 \n" - "vmla.f32 q1, q13, q15 \n" - "vst1.f32 {q8,q9},[%2,:128]! \n" - "vst1.f32 {q0,q1},[%2,:128]! \n" - "subs %3, #2 \n" - "bne 1b \n" - : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); -#else // default routine, works fine for non-arm cpus with current compilers - for (i=0; i < Ncvec; i += 2) { - v4sf ar, ai, br, bi; - ar = va[2*i+0]; ai = va[2*i+1]; - br = vb[2*i+0]; bi = vb[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); - vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); - ar = va[2*i+2]; ai = va[2*i+3]; - br = vb[2*i+2]; bi = vb[2*i+3]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); - vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); - } -#endif - if (s->transform == PFFFT_REAL) { - ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling; - ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling; - } -} - - -#else // defined(PFFFT_SIMD_DISABLE) - -// standard routine using scalar floats, without SIMD stuff. - -#define pffft_zreorder_nosimd pffft_zreorder -void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { - int k, N = setup->N; - if (setup->transform == PFFFT_COMPLEX) { - for (k=0; k < 2*N; ++k) out[k] = in[k]; - return; - } - else if (direction == PFFFT_FORWARD) { - float x_N = in[N-1]; - for (k=N-1; k > 1; --k) out[k] = in[k-1]; - out[0] = in[0]; - out[1] = x_N; - } else { - float x_N = in[1]; - for (k=1; k < N-1; ++k) out[k] = in[k+1]; - out[0] = in[0]; - out[N-1] = x_N; - } -} - -#define pffft_transform_internal_nosimd pffft_transform_internal -void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, - pffft_direction_t direction, int ordered) { - int Ncvec = setup->Ncvec; - int nf_odd = (setup->ifac[1] & 1); - - // temporary buffer is allocated on the stack if the scratch pointer is NULL - int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); - VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); - float *buff[2]; - int ib; - if (scratch == 0) scratch = scratch_on_stack; - buff[0] = output; buff[1] = scratch; - - if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered. - ib = (nf_odd ^ ordered ? 1 : 0); - - if (direction == PFFFT_FORWARD) { - if (setup->transform == PFFFT_REAL) { - ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - } else { - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); - } - if (ordered) { - pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; - } - } else { - if (input == buff[ib]) { - ib = !ib; // may happen when finput == foutput - } - if (ordered) { - pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); - input = buff[!ib]; - } - if (setup->transform == PFFFT_REAL) { - ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); - } else { - ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], - setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); - } - } - if (buff[ib] != output) { - int k; - // extra copy required -- this situation should happens only when finput == foutput - assert(input==output); - for (k=0; k < Ncvec; ++k) { - float a = buff[ib][2*k], b = buff[ib][2*k+1]; - output[2*k] = a; output[2*k+1] = b; - } - ib = !ib; - } - assert(buff[ib] == output); -} - -#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate -void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, - float *ab, float scaling) { - int i, Ncvec = s->Ncvec; - - if (s->transform == PFFFT_REAL) { - // take care of the fftpack ordering - ab[0] += a[0]*b[0]*scaling; - ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; - ++ab; ++a; ++b; --Ncvec; - } - for (i=0; i < Ncvec; ++i) { - float ar, ai, br, bi; - ar = a[2*i+0]; ai = a[2*i+1]; - br = b[2*i+0]; bi = b[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - ab[2*i+0] += ar*scaling; - ab[2*i+1] += ai*scaling; - } -} - -#endif // defined(PFFFT_SIMD_DISABLE) - -void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); -} - -void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { - pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); -} diff --git a/hvcc/generators/ir2c/static/pffft.h b/hvcc/generators/ir2c/static/pffft.h deleted file mode 100644 index 5db3d113..00000000 --- a/hvcc/generators/ir2c/static/pffft.h +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) - - Based on original fortran 77 code from FFTPACKv4 from NETLIB, - authored by Dr Paul Swarztrauber of NCAR, in 1985. - - As confirmed by the NCAR fftpack software curators, the following - FFTPACKv5 license applies to FFTPACKv4 sources. My changes are - released under the same terms. - - FFTPACK license: - - http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html - - Copyright (c) 2004 the University Corporation for Atmospheric - Research ("UCAR"). All rights reserved. Developed by NCAR's - Computational and Information Systems Laboratory, UCAR, - www.cisl.ucar.edu. - - Redistribution and use of the Software in source and binary forms, - with or without modification, is permitted provided that the - following conditions are met: - - - Neither the names of NCAR's Computational and Information Systems - Laboratory, the University Corporation for Atmospheric Research, - nor the names of its sponsors or contributors may be used to - endorse or promote products derived from this Software without - specific prior written permission. - - - Redistributions of source code must retain the above copyright - notices, this list of conditions, and the disclaimer below. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions, and the disclaimer below in the - documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. -*/ - -/* - PFFFT : a Pretty Fast FFT. - - This is basically an adaptation of the single precision fftpack - (v4) as found on netlib taking advantage of SIMD instruction found - on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). - - For architectures where no SIMD instruction is available, the code - falls back to a scalar version. - - Restrictions: - - - 1D transforms only, with 32-bit single precision. - - - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, - 144, 160, etc are all acceptable lengths). Performance is best for - 128<=N<=8192. - - - all (float*) pointers in the functions below are expected to - have an "simd-compatible" alignment, that is 16 bytes on x86 and - powerpc CPUs. - - You can allocate such buffers with the functions - pffft_aligned_malloc / pffft_aligned_free (or with stuff like - posix_memalign..) - -*/ - -#ifndef PFFFT_H -#define PFFFT_H - -#include // for size_t - -#ifdef __cplusplus -extern "C" { -#endif - - /* opaque struct holding internal stuff (precomputed twiddle factors) - this struct can be shared by many threads as it contains only - read-only data. - */ - typedef struct PFFFT_Setup PFFFT_Setup; - - /* direction of the transform */ - typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; - - /* type of transform */ - typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; - - /* - prepare for performing transforms of size N -- the returned - PFFFT_Setup structure is read-only so it can safely be shared by - multiple concurrent threads. - */ - PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); - void pffft_destroy_setup(PFFFT_Setup *); - /* - Perform a Fourier transform , The z-domain data is stored in the - most efficient order for transforming it back, or using it for - convolution. If you need to have its content sorted in the - "usual" way, that is as an array of interleaved complex numbers, - either use pffft_transform_ordered , or call pffft_zreorder after - the forward fft, and before the backward fft. - - Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. - Typically you will want to scale the backward transform by 1/N. - - The 'work' pointer should point to an area of N (2*N for complex - fft) floats, properly aligned. If 'work' is NULL, then stack will - be used instead (this is probably the best strategy for small - FFTs, say for N < 16384). - - input and output may alias. - */ - void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); - - /* - Similar to pffft_transform, but makes sure that the output is - ordered as expected (interleaved complex numbers). This is - similar to calling pffft_transform and then pffft_zreorder. - - input and output may alias. - */ - void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); - - /* - call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., - PFFFT_FORWARD) if you want to have the frequency components in - the correct "canonical" order, as interleaved complex numbers. - - (for real transforms, both 0-frequency and half frequency - components, which are real, are assembled in the first entry as - F(0)+i*F(n/2+1). Note that the original fftpack did place - F(n/2+1) at the end of the arrays). - - input and output should not alias. - */ - void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); - - /* - Perform a multiplication of the frequency components of dft_a and - dft_b and accumulate them into dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab += (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. - */ - void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); - - /* - the float buffers must have the correct alignment (16-byte boundary - on intel and powerpc). This function may be used to obtain such - correctly aligned buffers. - */ - void *pffft_aligned_malloc(size_t nb_bytes); - void pffft_aligned_free(void *); - - /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ - int pffft_simd_size(void); - -#ifdef __cplusplus -} -#endif - -#endif // PFFFT_H diff --git a/hvcc/generators/ir2c/templates/Heavy_NAME.cpp b/hvcc/generators/ir2c/templates/Heavy_NAME.cpp index fa786935..a16f226d 100644 --- a/hvcc/generators/ir2c/templates/Heavy_NAME.cpp +++ b/hvcc/generators/ir2c/templates/Heavy_NAME.cpp @@ -189,8 +189,6 @@ int Heavy_{{name}}::process(float **inputBuffers, float **outputBuffers, int n) {%- if nodsp is sameas false %} const int n4 = n & ~HV_N_SIMD_MASK; // ensure that the block size is a multiple of HV_N_SIMD - sendFloatToReceiver(0xB5B01859, static_cast(n4)); // send to __hv_blocksize - // temporary signal vars {%- if signal.numTemporaryBuffers.float > 0 %} hv_bufferf_t {% for i in range(signal.numTemporaryBuffers.float) %}Bf{{i}}{% if not loop.last %}, {%endif%}{% endfor %}; diff --git a/hvcc/interpreters/pd2hv/PdGraph.py b/hvcc/interpreters/pd2hv/PdGraph.py index 1bb17540..f993c26e 100644 --- a/hvcc/interpreters/pd2hv/PdGraph.py +++ b/hvcc/interpreters/pd2hv/PdGraph.py @@ -55,6 +55,9 @@ def __init__( # only used is this graph is actually a subpatch self.subpatch_name: Optional[str] = None + # the block size of this graph (used for rfft window size) + self.block_size: Optional[int] = None + # TODO(dromer) these are virtual attributes that are only instantiated with internal representation self._PdGraph__connections: List[Connection] = [] self._PdGraph__pd_path: str = "" @@ -227,6 +230,7 @@ def to_hv(self, export_args: bool = False) -> Dict: assert all(a is not None for a in self.hv_args), "Graph is missing a @hv_arg." return { "type": "graph", + "block_size": self.block_size, "imports": [], "args": self.hv_args if export_args else [], "objects": {o.obj_id: o.to_hv() for o in self.__objs}, diff --git a/hvcc/interpreters/pd2hv/PdParser.py b/hvcc/interpreters/pd2hv/PdParser.py index e0e6d923..d9f8f46f 100644 --- a/hvcc/interpreters/pd2hv/PdParser.py +++ b/hvcc/interpreters/pd2hv/PdParser.py @@ -356,6 +356,10 @@ def graph_from_canvas( continue if obj_type in ('block~',): + if obj_type == 'block~': + # grab the block size for this graph + g.block_size = obj_args[0] + # we ignore the object and continue g.add_warning( f"This graph contains a {obj_type} object that is ignored.", diff --git a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd index 2079e285..e2ce8dde 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd @@ -1,4 +1,4 @@ -#N canvas 796 489 436 234 12; +#N canvas 629 481 436 234 12; #X obj 25 15 inlet~; #N canvas 0 22 450 300 @hv_obj 0; #X obj 146 77 inlet; @@ -7,24 +7,21 @@ #X text 80 22 measured after initialisation; #X obj 25 184 outlet~; #X obj 211 48 loadbang; -#X obj 65 111 r __hv_blocksize; #X obj 211 185 outlet~; #X msg 211 104 table \$1 size; #X obj 71 62 table rfft_\$0; #X obj 211 76 symbol rfft_\$0; -#N canvas 0 22 450 300 @hv_obj 0; +#N canvas 0 22 450 300 @hv_obj 1; #X obj 55 136 outlet~; #X obj 55 75 inlet~; #X obj 190 74 inlet; -#X obj 122 78 inlet; #X obj 167 137 outlet~; #X restore 25 155 pd @hv_obj __rfft~f rfft_\$0; #X text 80 11 NOTE(dreamer): ensure that the table size is; -#X connect 0 0 10 0; -#X connect 1 0 10 2; -#X connect 4 0 9 0; -#X connect 5 0 10 1; -#X connect 7 0 1 0; -#X connect 9 0 7 0; -#X connect 10 0 3 0; -#X connect 10 1 6 0; +#X connect 0 0 9 0; +#X connect 1 0 9 1; +#X connect 4 0 8 0; +#X connect 6 0 1 0; +#X connect 8 0 6 0; +#X connect 9 0 3 0; +#X connect 9 1 5 0; diff --git a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd index 437506f8..2903aeef 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd @@ -1,15 +1,13 @@ #N canvas 1038 308 486 278 12; #X obj 24 59 inlet~; -#X text 22 24 measured after initialisation; +#X text 23 24 measured after initialisation; #X text 22 14 NOTE(mhroth): ensure that the table size is; #X obj 24 227 outlet~; -#X obj 157 121 r __hv_blocksize; -#X obj 91 59 inlet~; -#N canvas 0 22 450 300 @hv_obj 0; +#X obj 123 59 inlet~; +#N canvas 0 22 450 300 @hv_obj 1; #X obj 55 136 outlet~; #X obj 55 75 inlet~; -#X obj 247 75 inlet; -#X obj 179 75 inlet; +#X obj 180 75 inlet; #X obj 117 75 inlet~; #X restore 24 197 pd @hv_obj __rifft~f rifft_\$0; #N canvas 0 22 450 300 @hv_obj 0; @@ -18,13 +16,12 @@ #X restore 287 144 pd @hv_obj system; #X obj 287 63 loadbang; #X msg 287 119 table \$1 size; -#X obj 150 77 table rifft_\$0; +#X obj 150 116 table rifft_\$0; #X obj 287 91 symbol rifft_\$0; -#X connect 0 0 6 0; -#X connect 4 0 6 2; -#X connect 5 0 6 1; -#X connect 6 0 3 0; -#X connect 7 0 6 3; -#X connect 8 0 11 0; -#X connect 9 0 7 0; -#X connect 11 0 9 0; +#X connect 0 0 5 0; +#X connect 4 0 5 1; +#X connect 5 0 3 0; +#X connect 6 0 5 2; +#X connect 7 0 10 0; +#X connect 8 0 6 0; +#X connect 10 0 8 0; From 8aa4f6455b0cb1475206fb58880fe94b200fc42f Mon Sep 17 00:00:00 2001 From: dreamer Date: Sat, 27 Dec 2025 04:25:11 +0100 Subject: [PATCH 10/12] split rfft/rifft C impl; tables in init --- hvcc/core/hv2ir/HIrRFFT.py | 9 +- hvcc/core/hv2ir/HeavyParser.py | 2 +- hvcc/core/json/heavy.ir.json | 24 +---- hvcc/generators/ir2c/SignalRFFT.py | 42 +++++++- hvcc/generators/ir2c/ir2c.py | 4 +- hvcc/generators/ir2c/static/HvSignalRFFT.c | 113 +++++---------------- hvcc/generators/ir2c/static/HvSignalRFFT.h | 24 +++-- hvcc/interpreters/pd2hv/libs/pd/rfft~.pd | 31 ++---- hvcc/interpreters/pd2hv/libs/pd/rifft~.pd | 33 ++---- 9 files changed, 105 insertions(+), 177 deletions(-) diff --git a/hvcc/core/hv2ir/HIrRFFT.py b/hvcc/core/hv2ir/HIrRFFT.py index 8bfdd716..1580e2bb 100644 --- a/hvcc/core/hv2ir/HIrRFFT.py +++ b/hvcc/core/hv2ir/HIrRFFT.py @@ -37,13 +37,6 @@ def __init__( def reduce(self) -> Optional[tuple]: if self.graph is not None: self.args["block_size"] = self.graph.args["block_size"] - table_obj = self.graph.resolve_object_for_name( - self.args["table"], - ["table", "__table"]) - if table_obj is not None: - self.args["table_id"] = table_obj.id - return ({self}, []) - else: - self.add_error(f"Cannot find table named \"{self.args['table']}\" for object {self}.") + return ({self}, []) return None diff --git a/hvcc/core/hv2ir/HeavyParser.py b/hvcc/core/hv2ir/HeavyParser.py index 14901afd..f4b3bd54 100644 --- a/hvcc/core/hv2ir/HeavyParser.py +++ b/hvcc/core/hv2ir/HeavyParser.py @@ -115,7 +115,7 @@ def graph_from_object( """ # resolve default graph arguments graph_args = graph_args or {} - if(json_heavy["block_size"] is not None): + if json_heavy["block_size"] is not None: graph_args["block_size"] = int(json_heavy["block_size"]) for a in json_heavy["args"]: if a["name"] not in graph_args: diff --git a/hvcc/core/json/heavy.ir.json b/hvcc/core/json/heavy.ir.json index f5882640..5596ba2d 100644 --- a/hvcc/core/json/heavy.ir.json +++ b/hvcc/core/json/heavy.ir.json @@ -2273,9 +2273,7 @@ }, "__rfft~f": { "inlets": [ - "~f>", - "-->", - "-->" + "~f>" ], "ir": { "control": false, @@ -2286,13 +2284,7 @@ "~f>", "~f>" ], - "args": [{ - "name": "table", - "value_type": "string", - "description": "", - "default": "", - "required": true - }], + "args": [], "perf": { "avx": 0, "sse": 0 @@ -2301,9 +2293,7 @@ "__rifft~f": { "inlets": [ "~f>", - "~f>", - "-->", - "-->" + "~f>" ], "ir": { "control": false, @@ -2313,13 +2303,7 @@ "outlets": [ "~f>" ], - "args": [{ - "name": "table", - "value_type": "string", - "description": "", - "default": "", - "required": true - }], + "args": [], "perf": { "avx": 0, "sse": 0 diff --git a/hvcc/generators/ir2c/SignalRFFT.py b/hvcc/generators/ir2c/SignalRFFT.py index 49f8a100..49674b8a 100644 --- a/hvcc/generators/ir2c/SignalRFFT.py +++ b/hvcc/generators/ir2c/SignalRFFT.py @@ -37,9 +37,8 @@ def get_C_file_set(cls) -> set: @classmethod def get_C_init(cls, obj_type: str, obj_id: str, args: Dict) -> List[str]: return [ - "sRFFT_init(&sRFFT_{0}, &hTable_{1}, {2});".format( + "sRFFT_init(&sRFFT_{0}, {1});".format( obj_id, - args["table_id"], args["block_size"]) ] @@ -62,9 +61,44 @@ def get_C_process(cls, process_dict: IRSignalList, obj_type: str, obj_id: str, a cls._c_buffer(process_dict.outputBuffers[1]) ) ] - elif obj_type == "__rifft~f": + else: + raise Exception + + +class SignalRIFFT(HeavyObject): + + c_struct = "SignalRIFFT" + preamble = "sRIFFT" + + @classmethod + def get_C_header_set(cls) -> set: + return {"HvSignalRFFT.h"} + + @classmethod + def get_C_file_set(cls) -> set: + return {"HvSignalRFFT.h", "HvSignalRFFT.c"} + + @classmethod + def get_C_init(cls, obj_type: str, obj_id: str, args: Dict) -> List[str]: + return [ + "sRIFFT_init(&sRIFFT_{0}, {1});".format( + obj_id, + args["block_size"]) + ] + + @classmethod + def get_C_onMessage(cls, obj_type: str, obj_id: str, inlet_index: int, args: Dict) -> List[str]: + return [ + "sRIFFT_onMessage(_c, &Context(_c)->sRIFFT_{0}, {1}, m, NULL);".format( + obj_id, + inlet_index) + ] + + @classmethod + def get_C_process(cls, process_dict: IRSignalList, obj_type: str, obj_id: str, args: Dict) -> List[str]: + if obj_type == "__rifft~f": return [ - "__hv_rifft_f(&sRFFT_{0}, VIf({1}), VIf({2}), VOf({3}));".format( + "__hv_rifft_f(&sRIFFT_{0}, VIf({1}), VIf({2}), VOf({3}));".format( process_dict.id, cls._c_buffer(process_dict.inputBuffers[0]), cls._c_buffer(process_dict.inputBuffers[1]), diff --git a/hvcc/generators/ir2c/ir2c.py b/hvcc/generators/ir2c/ir2c.py index d9295352..920a511c 100644 --- a/hvcc/generators/ir2c/ir2c.py +++ b/hvcc/generators/ir2c/ir2c.py @@ -59,7 +59,7 @@ from hvcc.generators.ir2c.SignalLorenz import SignalLorenz from hvcc.generators.ir2c.SignalMath import SignalMath from hvcc.generators.ir2c.SignalPhasor import SignalPhasor -from hvcc.generators.ir2c.SignalRFFT import SignalRFFT +from hvcc.generators.ir2c.SignalRFFT import SignalRFFT, SignalRIFFT from hvcc.generators.ir2c.SignalRPole import SignalRPole from hvcc.generators.ir2c.SignalSample import SignalSample from hvcc.generators.ir2c.SignalSamphold import SignalSamphold @@ -107,7 +107,7 @@ class ir2c: "__phasor~f": SignalPhasor, "__phasor_k~f": SignalPhasor, "__rfft~f": SignalRFFT, - "__rifft~f": SignalRFFT, + "__rifft~f": SignalRIFFT, "__sample~f": SignalSample, "__samphold~f": SignalSamphold, "__slice": ControlSlice, diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.c b/hvcc/generators/ir2c/static/HvSignalRFFT.c index 178442d2..0b4b2e1e 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.c +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.c @@ -16,109 +16,50 @@ #include "HvSignalRFFT.h" -hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size) { - o->table = table; - hv_size_t numBytes = hTable_init(&o->inputs, size); +hv_size_t sRFFT_init(SignalRFFT *o, const int size) { + hv_size_t numBytes = hTable_init(&o->input, size); + numBytes += hTable_init(&o->outputReal, size/2+1); + numBytes += hTable_init(&o->outputImagin, size/2+1); return numBytes; } void sRFFT_free(SignalRFFT *o) { - o->table = NULL; - hTable_free(&o->inputs); + hTable_free(&o->input); + hTable_free(&o->outputReal); + hTable_free(&o->outputImagin); +} + +void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut1) { + // do fft stuff } void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, const HvMessage *m, void *sendMessage) { switch (letIndex) { - case 1: { - if (msg_isHashLike(m,0)) { - HvTable *table = hv_table_get(_c, msg_getHash(m,0)); - if (table != NULL) { - o->table = table; - if (hTable_getSize(&o->inputs) != hTable_getSize(table)) { - hTable_resize(&o->inputs, - (hv_uint32_t) hv_min_ui(hTable_getSize(&o->inputs), hTable_getSize(table))); - } - } - } - break; - } - case 2: { - if (msg_isFloat(m,0)) { - // rfft size should never exceed the coefficient table size - hTable_resize(&o->inputs, (hv_uint32_t) msg_getFloat(m,0)); - } - break; - } default: return; } } - -static inline int wrap(const int i, const int n) { - if (i < 0) return (i+n); - if (i >= n) return (i-n); - return i; +hv_size_t sRIFFT_init(SignalRIFFT *o, const int size) { + hv_size_t numBytes = hTable_init(&o->inputReal, size/2+1); + numBytes += hTable_init(&o->inputImagin, size/2+1); + numBytes += hTable_init(&o->output, size); + return numBytes; } - -void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut1) { - hv_assert(o->table != NULL); - float *const work = hTable_getBuffer(o->table); - hv_assert(work != NULL); - const int n = hTable_getSize(o->table); // length fir filter - hv_assert((n&HV_N_SIMD_MASK) == 0); // n is a multiple of HV_N_SIMD - - float *const inputs = hTable_getBuffer(&o->inputs); - hv_assert(inputs != NULL); - const int m = hTable_getSize(&o->inputs); // length of input buffer. - hv_assert(m >= n); - const int h_orig = hTable_getHead(&o->inputs); - - // float *const bOut = (float *)(hv_alloca(2*n*sizeof(float))); - float *const bOut = (float *)(hv_alloca(sizeof(bIn))); - - // do fft stuff - - // uninterleave result into the output buffers - for (int j = 0; j < n; ++j) { - bOut0[n+j] = bOut[0+2*j]; - bOut1[n+j] = bOut[1+2*j]; - } - - __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer - hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); +void sRIFFT_free(SignalRIFFT *o) { + hTable_free(&o->inputReal); + hTable_free(&o->inputImagin); + hTable_free(&o->output); } +void __hv_rifft_f(SignalRIFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { + // do ifft stuff +} -void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { - hv_assert(o->table != NULL); - float *const work = hTable_getBuffer(o->table); - hv_assert(work != NULL); - const int n = hTable_getSize(o->table); // length fir filter - hv_assert((n&HV_N_SIMD_MASK) == 0); // n is a multiple of HV_N_SIMD - - float *const inputs = hTable_getBuffer(&o->inputs); - hv_assert(inputs != NULL); - const int m = hTable_getSize(&o->inputs); // length of input buffer. - hv_assert(m >= n); - const int h_orig = hTable_getHead(&o->inputs); - - float *bIn00 = &bIn0; - float *bIn10 = &bIn1; - // float *const bIn = (float *)(hv_alloca(2*n*sizeof(float))); - float *const bIn = (float *)(hv_alloca(sizeof(bOut))); - - // interleave the input buffers into the transform buffer - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < n; ++j) { - bIn[0+2*j] = bIn00[n+j]; - bIn[1+2*j] = bIn10[n+j]; - } +void sRIFFT_onMessage(HeavyContextInterface *_c, SignalRIFFT *o, int letIndex, + const HvMessage *m, void *sendMessage) { + switch (letIndex) { + default: return; } - - // do ifft stuff - - // __hv_store_f(inputs+h_orig, bIn); // store the new input to the inputs buffer - hTable_setHead(&o->inputs, wrap(h_orig+HV_N_SIMD, m)); } diff --git a/hvcc/generators/ir2c/static/HvSignalRFFT.h b/hvcc/generators/ir2c/static/HvSignalRFFT.h index f47e5c76..f79b7931 100644 --- a/hvcc/generators/ir2c/static/HvSignalRFFT.h +++ b/hvcc/generators/ir2c/static/HvSignalRFFT.h @@ -25,20 +25,26 @@ extern "C" { typedef struct SignalRFFT { - struct HvTable *table; - struct HvTable inputs; + struct HvTable input; + struct HvTable outputReal; + struct HvTable outputImagin; } SignalRFFT; -hv_size_t sRFFT_init(SignalRFFT *o, struct HvTable *table, const int size); - +hv_size_t sRFFT_init(SignalRFFT *o, const int size); void sRFFT_free(SignalRFFT *o); - -void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, - const HvMessage *m, void *sendMessage); - +void sRFFT_onMessage(HeavyContextInterface *_c, SignalRFFT *o, int letIndex, const HvMessage *m, void *sendMessage); void __hv_rfft_f(SignalRFFT *o, hv_bInf_t bIn, hv_bOutf_t bOut0, hv_bOutf_t bOut1); -void __hv_rifft_f(SignalRFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut); +typedef struct SignalRIFFT { + struct HvTable inputReal; + struct HvTable inputImagin; + struct HvTable output; +} SignalRIFFT; + +hv_size_t sRIFFT_init(SignalRIFFT *o, const int size); +void sRIFFT_free(SignalRIFFT *o); +void sRIFFT_onMessage(HeavyContextInterface *_c, SignalRIFFT *o, int letIndex, const HvMessage *m, void *sendMessage); +void __hv_rifft_f(SignalRIFFT *o, hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut); #ifdef __cplusplus } // extern "C" diff --git a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd index e2ce8dde..4cc96863 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rfft~.pd @@ -1,27 +1,12 @@ -#N canvas 629 481 436 234 12; -#X obj 25 15 inlet~; +#N canvas 532 457 436 234 12; +#X obj 59 59 inlet~; +#X obj 59 120 outlet~; +#X obj 190 121 outlet~; #N canvas 0 22 450 300 @hv_obj 0; -#X obj 146 77 inlet; -#X obj 147 120 outlet; -#X restore 211 129 pd @hv_obj system; -#X text 80 22 measured after initialisation; -#X obj 25 184 outlet~; -#X obj 211 48 loadbang; -#X obj 211 185 outlet~; -#X msg 211 104 table \$1 size; -#X obj 71 62 table rfft_\$0; -#X obj 211 76 symbol rfft_\$0; -#N canvas 0 22 450 300 @hv_obj 1; #X obj 55 136 outlet~; #X obj 55 75 inlet~; -#X obj 190 74 inlet; #X obj 167 137 outlet~; -#X restore 25 155 pd @hv_obj __rfft~f rfft_\$0; -#X text 80 11 NOTE(dreamer): ensure that the table size is; -#X connect 0 0 9 0; -#X connect 1 0 9 1; -#X connect 4 0 8 0; -#X connect 6 0 1 0; -#X connect 8 0 6 0; -#X connect 9 0 3 0; -#X connect 9 1 5 0; +#X restore 59 91 pd @hv_obj __rfft~f; +#X connect 0 0 3 0; +#X connect 3 0 1 0; +#X connect 3 1 2 0; diff --git a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd index 2903aeef..124d2c02 100644 --- a/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd +++ b/hvcc/interpreters/pd2hv/libs/pd/rifft~.pd @@ -1,27 +1,12 @@ -#N canvas 1038 308 486 278 12; -#X obj 24 59 inlet~; -#X text 23 24 measured after initialisation; -#X text 22 14 NOTE(mhroth): ensure that the table size is; -#X obj 24 227 outlet~; -#X obj 123 59 inlet~; -#N canvas 0 22 450 300 @hv_obj 1; +#N canvas 1222 648 486 278 12; +#X obj 74 59 inlet~; +#X obj 74 127 outlet~; +#X obj 211 59 inlet~; +#N canvas 0 22 450 300 @hv_obj 0; #X obj 55 136 outlet~; #X obj 55 75 inlet~; -#X obj 180 75 inlet; #X obj 117 75 inlet~; -#X restore 24 197 pd @hv_obj __rifft~f rifft_\$0; -#N canvas 0 22 450 300 @hv_obj 0; -#X obj 146 77 inlet; -#X obj 147 120 outlet; -#X restore 287 144 pd @hv_obj system; -#X obj 287 63 loadbang; -#X msg 287 119 table \$1 size; -#X obj 150 116 table rifft_\$0; -#X obj 287 91 symbol rifft_\$0; -#X connect 0 0 5 0; -#X connect 4 0 5 1; -#X connect 5 0 3 0; -#X connect 6 0 5 2; -#X connect 7 0 10 0; -#X connect 8 0 6 0; -#X connect 10 0 8 0; +#X restore 74 98 pd @hv_obj __rifft~f; +#X connect 0 0 3 0; +#X connect 2 0 3 1; +#X connect 3 0 1 0; From cc61aa4575e11adff1653ccdde3239d238791d53 Mon Sep 17 00:00:00 2001 From: dreamer Date: Sat, 27 Dec 2025 05:23:11 +0100 Subject: [PATCH 11/12] set default size --- hvcc/core/hv2ir/HeavyParser.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hvcc/core/hv2ir/HeavyParser.py b/hvcc/core/hv2ir/HeavyParser.py index f4b3bd54..5a6b2cc0 100644 --- a/hvcc/core/hv2ir/HeavyParser.py +++ b/hvcc/core/hv2ir/HeavyParser.py @@ -115,8 +115,14 @@ def graph_from_object( """ # resolve default graph arguments graph_args = graph_args or {} - if json_heavy["block_size"] is not None: - graph_args["block_size"] = int(json_heavy["block_size"]) + + # set the block size + try: + if json_heavy["block_size"] is not None: + graph_args["block_size"] = int(json_heavy["block_size"]) + except KeyError: + graph_args["block_size"] = 64 + for a in json_heavy["args"]: if a["name"] not in graph_args: if a["required"]: From 0dd952f1d9a1edbf1d517e1fbacacdc0d17dfc0d Mon Sep 17 00:00:00 2001 From: dreamer Date: Tue, 30 Dec 2025 14:35:00 +0100 Subject: [PATCH 12/12] run standalone signal --- tests/framework/base_signal.py | 1 + tests/pd/signal_rfft/test-rfft.pd | 19 +++++++++++++++++++ tests/test_signal.py | 23 +++++++++++++++-------- 3 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 tests/pd/signal_rfft/test-rfft.pd diff --git a/tests/framework/base_signal.py b/tests/framework/base_signal.py index 4079c894..9e10c7de 100644 --- a/tests/framework/base_signal.py +++ b/tests/framework/base_signal.py @@ -90,6 +90,7 @@ def _test_signal_patch(self, pd_file: str): try: out_dir = self._run_hvcc(pd_path) + assert out_dir is not None except Exception as e: self.fail(str(e)) diff --git a/tests/pd/signal_rfft/test-rfft.pd b/tests/pd/signal_rfft/test-rfft.pd new file mode 100644 index 00000000..3d5e1eb9 --- /dev/null +++ b/tests/pd/signal_rfft/test-rfft.pd @@ -0,0 +1,19 @@ +#N canvas 1039 0 450 300 12; +#N canvas 1216 43 450 300 subtest 1; +#X obj 74 84 rfft~; +#X obj 74 146 rifft~; +#X obj 72 231 outlet~; +#X obj 74 43 inlet~; +#X obj 201 58 block~ 256; +#X obj 74 181 /~ 256; +#X connect 0 0 1 0; +#X connect 0 1 1 1; +#X connect 1 0 5 0; +#X connect 3 0 0 0; +#X connect 5 0 2 0; +#X restore 71 104 pd subtest; +#X obj 45 166 dac~; +#X obj 46 44 osc~ 220; +#X connect 0 0 1 1; +#X connect 2 0 0 0; +#X connect 2 0 1 0; diff --git a/tests/test_signal.py b/tests/test_signal.py index 75bd33be..b078d168 100644 --- a/tests/test_signal.py +++ b/tests/test_signal.py @@ -16,6 +16,7 @@ import argparse import os +import shutil from tests.framework.base_signal import TestPdSignalBase @@ -62,17 +63,23 @@ def main(): action="count") args = parser.parse_args() - out_dir = TestPdSignalPatches._run_hvcc(args.pd_path) + test_patch = TestPdSignalPatches() + test_patch.setUp() + out_dir = test_patch._run_hvcc(args.pd_path) + assert out_dir is not None c_src_dir = os.path.join(out_dir, "c") - c_sources = [os.path.join(c_src_dir, c) for c in os.listdir(c_src_dir) if c.endswith(".c")] + shutil.copy2(os.path.join(test_patch.SCRIPT_DIR, "src/test_signal.c"), c_src_dir) + shutil.copy2(os.path.join(test_patch.SCRIPT_DIR, "src/tinywav/tinywav.h"), c_src_dir) + shutil.copy2(os.path.join(test_patch.SCRIPT_DIR, "src/tinywav/tinywav.c"), c_src_dir) + c_sources = os.listdir(c_src_dir) - wav_path = TestPdSignalPatches.compile_and_run( - out_dir, - c_sources, - args.samplerate, - args.blocksize, - args.numblocks, + wav_path = test_patch.compile_and_run( + source_files=c_sources, + out_dir=out_dir, + sample_rate=args.samplerate, + block_size=args.blocksize, + num_iterations=args.numblocks, flag=args.simd) if args.verbose: