arith.fuc (2631B)
/*
 * Copyright 2014 Martin Peres <martin.peres@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
21 * 22 * Authors: Martin Peres 23 */ 24 25/****************************************************************************** 26 * arith data segment 27 *****************************************************************************/ 28#ifdef INCLUDE_PROC 29#endif 30 31#ifdef INCLUDE_DATA 32#endif 33 34/****************************************************************************** 35 * arith code segment 36 *****************************************************************************/ 37#ifdef INCLUDE_CODE 38 39// does a 32x32 -> 64 multiplication 40// 41// A * B = A_lo * B_lo 42// + ( A_hi * B_lo ) << 16 43// + ( A_lo * B_hi ) << 16 44// + ( A_hi * B_hi ) << 32 45// 46// $r15 - current 47// $r14 - A 48// $r13 - B 49// $r12 - mul_lo (return) 50// $r11 - mul_hi (return) 51// $r0 - zero 52mulu32_32_64: 53 push $r1 // A_hi 54 push $r2 // B_hi 55 push $r3 // tmp0 56 push $r4 // tmp1 57 58 shr b32 $r1 $r14 16 59 shr b32 $r2 $r13 16 60 61 clear b32 $r12 62 clear b32 $r11 63 64 // A_lo * B_lo 65 mulu $r12 $r14 $r13 66 67 // ( A_hi * B_lo ) << 16 68 mulu $r3 $r1 $r13 // tmp0 = A_hi * B_lo 69 mov b32 $r4 $r3 70 and $r3 0xffff // tmp0 = tmp0_lo 71 shl b32 $r3 16 72 shr b32 $r4 16 // tmp1 = tmp0_hi 73 add b32 $r12 $r3 74 adc b32 $r11 $r4 75 76 // ( A_lo * B_hi ) << 16 77 mulu $r3 $r14 $r2 // tmp0 = A_lo * B_hi 78 mov b32 $r4 $r3 79 and $r3 0xffff // tmp0 = tmp0_lo 80 shl b32 $r3 16 81 shr b32 $r4 16 // tmp1 = tmp0_hi 82 add b32 $r12 $r3 83 adc b32 $r11 $r4 84 85 // ( A_hi * B_hi ) << 32 86 mulu $r3 $r1 $r2 // tmp0 = A_hi * B_hi 87 add b32 $r11 $r3 88 89 pop $r4 90 pop $r3 91 pop $r2 92 pop $r1 93 ret 94#endif