You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
	
	
		
			216 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			ArmAsm
		
	
			
		
		
	
	
			216 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			ArmAsm
		
	
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file downsample_fast.c. Bit-exact.
 | |
| 
 | |
| #include "webrtc/system_wrappers/interface/asm_defines.h"
 | |
| 
 | |
@ -----------------------------------------------------------------------------
@ WebRtcSpl_DownsampleFastNeon
@
@ For every output index k in [0, data_out_length) this routine stores
@   data_out[k] = sat16((2048 + sum_j coefficients[j] * data_in[i - j]) >> 12)
@ with i = delay + k * factor and j in [0, coefficients_length).
@
@ Arguments on entry (stack offsets are valid after the push of r4-r11):
@   r0          data_in
@   r1          data_in_length
@   r2          data_out
@   r3          data_out_length
@   [sp, #32]   coefficients
@   [sp, #36]   coefficients_length
@   [sp, #40]   factor
@   [sp, #44]   delay
@
@ Returns in r0:
@   -1  when data_out_length <= 0, coefficients_length <= 0, or
@       endpos > data_in_length, where
@       endpos = delay + factor * (data_out_length - 1) + 1
@    0  otherwise.
@
@ Register roles once set-up is complete:
@   r0   coefficients_length (becomes the return value at exit)
@   r2   output write pointer, post-incremented by every store
@   r3   endpos - 7 * factor in the vector part, endpos in the scalar tail
@   r4   -2, the backwards stride of the coefficient pointer (vector part)
@   r5   factor (temporarily 2 * factor inside the general vector loop)
@   r7   coefficient read pointer, walks from the last coefficient to the first
@   r8   &coefficients[coefficients_length]
@   r9   i, the current input index
@   r10  pointer rewind applied after the last load of an inner iteration
@   r11  &data_in[i]
@   r12  inner-loop counter
@   q2, q3  32-bit accumulators for 8 outputs computed in parallel
@
@ NOTE(review): delay is read with ldrsh and the endpos product uses smulbb, so
@ only the low 16 bits of delay, factor and (data_out_length - 1) take part in
@ the computation. Confirm that callers stay inside that range.
@ -----------------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
  push {r4-r11}                             @ 8 words saved: stack args begin at [sp, #32].

  @ Nothing to produce: report an error.
  cmp r3, #0
  movle r0, #-1
  ble END

  @ Compute endpos and start the input index at delay.
  ldrsh r12, [sp, #44]                      @ r12 = delay (signed 16-bit load)
  ldr r5, [sp, #40]                         @ r5 = factor
  add r4, r12, #1                           @ r4 = delay + 1
  sub r3, r3, #1                            @ r3 = data_out_length - 1
  smulbb r3, r5, r3                         @ r3 = factor * (data_out_length - 1), 16 x 16 bits
  ldr r8, [sp, #32]                         @ r8 = &coefficients[0]
  mov r9, r12                               @ r9 = i = delay
  add r3, r3, r4                            @ r3 = endpos

  @ The input must hold at least endpos samples.
  cmp r3, r1
  movgt r0, #-1
  bgt END

  @ Pointer set-up; r3 becomes the limit for the 8-way unrolled part.
  sub r3, r3, r5, lsl #3                    @ r3 = endpos - 8 * factor
  add r11, r0, r12, lsl #1                  @ r11 = &data_in[delay]
  ldr r0, [sp, #36]                         @ r0 = coefficients_length (data_in no longer needed)
  add r3, r3, r5                            @ r3 = endpos - 7 * factor

  @ An empty coefficient array is an error as well.
  cmp r0, #0
  movle r0, #-1
  ble END

  add r8, r8, r0, lsl #1                    @ r8 = &coefficients[coefficients_length]
  cmp r9, r3
  bge POST_LOOP_ENDPOS                      @ No full group of 8 outputs: scalar tail only.

@
@ Part one: 8 outputs per outer iteration. Three variants exist, selected by
@ factor: 2, 4, or anything else.
@
  mov r4, #-2                               @ Coefficients are read back to front.

  @ factor == 2: each inner iteration issues two vld2 loads. The first one
  @ advances the data pointer by 16 bytes through writeback, so adding -14
  @ after the second one leaves the pointer exactly one sample (2 bytes) ahead.
  cmp r5, #2
  moveq r10, #-14
  beq LOOP_ENDPOS_FACTOR2

  @ factor == 4: same idea with vld4. The first load advances 32 bytes, hence
  @ -30 for a net step of one sample.
  cmp r5, #4
  moveq r10, #-30
  beq LOOP_ENDPOS_FACTOR4

  @ Any other factor: seven single-lane loads step by 2 * factor bytes each,
  @ so the eighth load has to rewind by 14 * factor bytes and then add 2.
  mov r10, r5, lsl #4                       @ r10 = 16 * factor
  rsb r10, r10, #2                          @ r10 = 2 - 16 * factor
  add r10, r10, r5, lsl #1                  @ r10 = 2 - 14 * factor
  lsl r5, r5, #1                            @ r5 = 2 * factor: byte stride between outputs

@ Variant for factor != 2 and factor != 4.
LOOP_ENDPOS_GENERAL:
  @ Per-group set-up: rounding constant in all eight accumulators.
  vmov.i32 q2, #2048
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ r7 = &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ r12 = coefficients_length - 1
  sub r1, r11, r12, lsl #1                  @ r1 = &data_in[i - (coefficients_length - 1)]

LOOP_COEFF_LENGTH_GENERAL:
  vld1.16 {d2[], d3[]}, [r7], r4            @ Broadcast coefficients[j]; step to j - 1.
  vld1.16 d0[0], [r1], r5                   @ Sample for output 0: data_in[i - j]
  vld1.16 d0[1], [r1], r5                   @ Sample for output 1: data_in[i + factor - j]
  vld1.16 d0[2], [r1], r5                   @ Sample for output 2: data_in[i + 2 * factor - j]
  vld1.16 d0[3], [r1], r5                   @ Sample for output 3: data_in[i + 3 * factor - j]
  vld1.16 d1[0], [r1], r5                   @ Sample for output 4: data_in[i + 4 * factor - j]
  vld1.16 d1[1], [r1], r5                   @ Sample for output 5: data_in[i + 5 * factor - j]
  vld1.16 d1[2], [r1], r5                   @ Sample for output 6: data_in[i + 6 * factor - j]
  vld1.16 d1[3], [r1], r10                  @ Sample for output 7, then rewind to the next tap.
  subs r12, r12, #1                         @ Flags stay live until the bge below.
  vmlal.s16 q2, d0, d2                      @ Outputs 0-3 += sample * coefficient
  vmlal.s16 q3, d1, d3                      @ Outputs 4-7 += sample * coefficient
  bge LOOP_COEFF_LENGTH_GENERAL

  @ Narrow with >> 12 and signed saturation, then write 8 outputs.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r11, r5, lsl #3                  @ r11 += 16 * factor bytes -> &data_in[i + 8 * factor]
  add r9, r9, r5, lsl #2                    @ i += 8 * factor (r5 holds 2 * factor here)
  cmp r9, r3                                @ Another full group of 8 available?
  blt LOOP_ENDPOS_GENERAL
  asr r5, r5, #1                            @ r5 = factor again for the scalar tail.
  b POST_LOOP_ENDPOS

@ Variant for factor == 2.
LOOP_ENDPOS_FACTOR2:
  @ Per-group set-up: rounding constant in all eight accumulators.
  vmov.i32 q2, #2048
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ r7 = &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ r12 = coefficients_length - 1
  sub r1, r11, r12, lsl #1                  @ r1 = &data_in[i - (coefficients_length - 1)]

LOOP_COEFF_LENGTH_FACTOR2:
  vld1.16 {d16[], d17[]}, [r7], r4          @ Broadcast coefficients[j]; step to j - 1.
  vld2.16 {d0, d1}, [r1]!                   @ De-interleaving load: d0 holds every 2nd sample.
  vld2.16 {d2, d3}, [r1], r10               @ Next 8 samples into d2/d3, then rewind by 14 bytes.
  subs r12, r12, #1                         @ Flags stay live until the bge below.
  vmlal.s16 q2, d0, d16                     @ Outputs 0-3 += sample * coefficient
  vmlal.s16 q3, d2, d17                     @ Outputs 4-7 += sample * coefficient
  bge LOOP_COEFF_LENGTH_FACTOR2

  @ Narrow with >> 12 and signed saturation, then write 8 outputs.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r11, r5, lsl #4                  @ r11 += 16 * factor bytes -> &data_in[i + 8 * factor]
  add r9, r9, r5, lsl #3                    @ i += 8 * factor
  cmp r9, r3                                @ Another full group of 8 available?
  blt LOOP_ENDPOS_FACTOR2
  b POST_LOOP_ENDPOS

@ Variant for factor == 4.
LOOP_ENDPOS_FACTOR4:
  @ Per-group set-up: rounding constant in all eight accumulators.
  vmov.i32 q2, #2048
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ r7 = &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ r12 = coefficients_length - 1
  sub r1, r11, r12, lsl #1                  @ r1 = &data_in[i - (coefficients_length - 1)]

LOOP_COEFF_LENGTH_FACTOR4:
  vld1.16 {d16[], d17[]}, [r7], r4          @ Broadcast coefficients[j]; step to j - 1.
  vld4.16 {d0, d1, d2, d3}, [r1]!           @ De-interleaving load: d0 holds every 4th sample.
  vld4.16 {d18, d19, d20, d21}, [r1], r10   @ Next 16 samples, d18 is used; rewind by 30 bytes.
  subs r12, r12, #1                         @ Flags stay live until the bge below.
  vmlal.s16 q2, d0, d16                     @ Outputs 0-3 += sample * coefficient
  vmlal.s16 q3, d18, d17                    @ Outputs 4-7 += sample * coefficient
  bge LOOP_COEFF_LENGTH_FACTOR4

  add r11, r11, r5, lsl #4                  @ r11 += 16 * factor bytes -> &data_in[i + 8 * factor]
  add r9, r9, r5, lsl #3                    @ i += 8 * factor

  @ Narrow with >> 12 and signed saturation, then write 8 outputs.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  cmp r9, r3                                @ Another full group of 8 available?
  vst1.16 {d0, d1}, [r2]!                   @ Store does not touch the flags set by cmp.

  blt LOOP_ENDPOS_FACTOR4                   @ Otherwise fall through into the scalar tail.

@
@ Part two: one output per iteration for whatever is left.
@

POST_LOOP_ENDPOS:
  add r3, r3, r5, lsl #3                    @ r3 = endpos - 7 * factor + 8 * factor
  sub r3, r3, r5                            @ r3 = endpos
  cmp r9, r3
  movge r0, #0                              @ All outputs already written: success.
  bge END

LOOP2_ENDPOS:
  @ Per-output set-up.
  mov r7, r8                                @ r7 = &coefficients[coefficients_length]
  sub r12, r0, #1                           @ r12 = coefficients_length - 1
  sub r6, r11, r12, lsl #1                  @ r6 = &data_in[i - (coefficients_length - 1)]

  mov r1, #2048                             @ Accumulator starts at the rounding constant.

LOOP2_COEFF_LENGTH:
  ldrsh r4, [r7, #-2]!                      @ Pre-decrement, then read coefficients[j].
  ldrsh r10, [r6], #2                       @ Read data_in[i - j], then advance one sample.
  smlabb r1, r4, r10, r1                    @ r1 += coefficient * sample
  subs r12, r12, #1
  bge LOOP2_COEFF_LENGTH

  @ >> 12 with saturation to 16 bits, then write one output.
  ssat r1, #16, r1, asr #12
  strh r1, [r2], #2

  add r11, r11, r5, lsl #1                  @ r11 -> &data_in[i + factor]
  add r9, r9, r5                            @ i += factor
  cmp r9, r3                                @ More outputs before endpos?
  blt LOOP2_ENDPOS

  mov r0, #0                                @ Success.

END:
  pop {r4-r11}
  bx  lr
 |