| 1 | /*************************************************************************** |
|---|
| 2 | * MMX routine * |
|---|
| 3 | * Copyright (C) 2005 by Prakash Punnoor * |
|---|
| 4 | * prakash@punnoor.de * |
|---|
| 5 | * * |
|---|
| 6 | * This program is free software; you can redistribute it and/or modify * |
|---|
| 7 | * it under the terms of the GNU Library General Public License as * |
|---|
| 8 | * published by the Free Software Foundation; either version 2 of the * |
|---|
| 9 | * License, or (at your option) any later version. * |
|---|
| 10 | * * |
|---|
| 11 | * This program is distributed in the hope that it will be useful, * |
|---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of * |
|---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
|---|
| 14 | * GNU General Public License for more details. * |
|---|
| 15 | * * |
|---|
| 16 | * You should have received a copy of the GNU Library General Public * |
|---|
| 17 | * License along with this program; if not, write to the * |
|---|
| 18 | * Free Software Foundation, Inc., * |
|---|
| 19 | * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * |
|---|
| 20 | ***************************************************************************/ |
|---|
| 21 | #include "al_siteconfig.h" |
|---|
| 22 | |
|---|
| 23 | #include <AL/al.h> |
|---|
| 24 | #include "al_cpu_caps.h" |
|---|
| 25 | #include "x86_simd_support_prk.h" |
|---|
| 26 | |
|---|
| 27 | /* Fixed-point shift applied to the multiplier; the MMX path uses pmulhw, which keeps the high 16 bits of each 16x16 product, so this must be 16 */ |
|---|
| 28 | #define SCALING_POWER 16 |
|---|
| 29 | #define SCALING_FACTOR (1 << SCALING_POWER) |
|---|
| 30 | |
|---|
| 31 | void _alFloatMul(ALshort *bpt, ALfloat sa, ALuint len); |
|---|
| 32 | |
|---|
| 33 | void _alFloatMul(ALshort *bpt, ALfloat sa, ALuint len) { |
|---|
| 34 | ALint scaled_sa = sa * SCALING_FACTOR; |
|---|
| 35 | ALint iter; |
|---|
| 36 | |
|---|
| 37 | #ifdef __MMX__ |
|---|
| 38 | if (_alHaveMMX()) { |
|---|
| 39 | union { |
|---|
| 40 | short s[4]; |
|---|
| 41 | v4hi v; |
|---|
| 42 | } ALIGN16(v_sa); |
|---|
| 43 | ALuint samples_main; |
|---|
| 44 | ALuint samples_pre; |
|---|
| 45 | ALuint samples_post; |
|---|
| 46 | v4hi temp; |
|---|
| 47 | |
|---|
| 48 | |
|---|
| 49 | samples_pre = MMX_ALIGN - (aint)bpt % MMX_ALIGN; |
|---|
| 50 | samples_pre /= sizeof(ALshort); |
|---|
| 51 | samples_main = len - samples_pre; |
|---|
| 52 | samples_post = samples_main % 8; |
|---|
| 53 | samples_main = samples_main / 8; |
|---|
| 54 | len = samples_post; |
|---|
| 55 | |
|---|
| 56 | while(samples_pre--) { |
|---|
| 57 | iter = *bpt; |
|---|
| 58 | iter *= scaled_sa; |
|---|
| 59 | iter >>= SCALING_POWER; |
|---|
| 60 | *bpt = iter; |
|---|
| 61 | ++bpt; |
|---|
| 62 | } |
|---|
| 63 | |
|---|
| 64 | if (scaled_sa < (1 << 15)) { |
|---|
| 65 | /* we do signed multiplication, so 1 << 15 is the max */ |
|---|
| 66 | v_sa.s[0] = scaled_sa; |
|---|
| 67 | v_sa.s[1] = v_sa.s[0]; |
|---|
| 68 | v_sa.s[2] = scaled_sa; |
|---|
| 69 | v_sa.s[3] = v_sa.s[0]; |
|---|
| 70 | |
|---|
| 71 | while (samples_main--) { |
|---|
| 72 | *(v4hi*)bpt = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa.v); |
|---|
| 73 | bpt += 4; |
|---|
| 74 | *(v4hi*)bpt = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa.v); |
|---|
| 75 | bpt += 4; |
|---|
| 76 | } |
|---|
| 77 | } else { |
|---|
| 78 | /* we lose 1 bit here, but well... */ |
|---|
| 79 | v_sa.s[0] = scaled_sa >> 1; |
|---|
| 80 | v_sa.s[1] = v_sa.s[0]; |
|---|
| 81 | v_sa.s[2] = v_sa.s[0]; |
|---|
| 82 | v_sa.s[3] = v_sa.s[0]; |
|---|
| 83 | |
|---|
| 84 | while (samples_main--) { |
|---|
| 85 | temp = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa.v); |
|---|
| 86 | *(v4hi*)bpt = __builtin_ia32_psllw(temp, 1LL); |
|---|
| 87 | bpt += 4; |
|---|
| 88 | temp = __builtin_ia32_pmulhw(*(v4hi*)bpt, v_sa.v); |
|---|
| 89 | *(v4hi*)bpt = __builtin_ia32_psllw(temp, 1LL); |
|---|
| 90 | bpt += 4; |
|---|
| 91 | } |
|---|
| 92 | } |
|---|
| 93 | __builtin_ia32_emms(); |
|---|
| 94 | } |
|---|
| 95 | #endif /* __MMX__ */ |
|---|
| 96 | |
|---|
| 97 | while(len--) { |
|---|
| 98 | iter = *bpt; |
|---|
| 99 | iter *= scaled_sa; |
|---|
| 100 | iter >>= SCALING_POWER; |
|---|
| 101 | *bpt = iter; |
|---|
| 102 | ++bpt; |
|---|
| 103 | } |
|---|
| 104 | } |
|---|