2.16.2009

YUV to RGB in ARMv4 assembly

以前 ipaq 3630 familiar 的bbplayer 有一段code作 yuv to rgb..
(http://www.koders.com/noncode/fid9B79A2EAD6C3F6EE8454AB93E5D9F77A1C509D19.aspx?s=mp3)
用 assembly 寫的,因為看到 版權宣告是 free 的,所以把全部內容都貼出來:

/*
Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

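/* WARNING: this function only works when stride_U == stride_V (some shortcuts
are taken to avoid extra pointer computations at the end of each line).

As written, the code also never reads _pv: it addresses V as
_pu + (source width * source height) / 4, and it steps the Y pointers by the
source width, so the Y line stride has to be equal to the source width.

C-like prototype:
void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results, unsigned char *_py,unsigned char *_pu,unsigned char *_pv);

*/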

#ifdef __arm__

.text
.align

.global convert_yuv420_rgb565
@ ----------------------------------------------------------------------
@ void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results,
@                            unsigned char *_py, unsigned char *_pu,
@                            unsigned char *_pv);
@
@ Converts planar YUV 4:2:0 to packed RGB565. Each inner-loop iteration
@ handles a 2x2 pixel quad: one U and one V sample are shared by two Y
@ samples on the current line and two on the line below.
@ ARM state, GNU as pre-UAL syntax.
@
@ In:   r0 = picture; the words read from it are
@              [r0, #0]   dest width    (saved on the stack, never read back)
@              [r0, #4]   dest height   (saved on the stack, never read back)
@              [r0, #12]  source width  (pixels)
@              [r0, #16]  source height (lines)
@              [r0, #20]  RGB stride    (bytes per output line)
@       r1 = results (output buffer)
@       r2 = _py
@       r3 = _pu
@       fifth argument _pv (on the caller stack) is never read:
@       V is addressed as _pu + (source width * source height) / 4
@ Out:  nothing in registers
@ Regs: r4-r12 and lr are saved and restored; r0-r3 and flags are clobbered
@ Stack: 40 bytes of saved registers + 44 bytes of loop state
@
@ What the pointer arithmetic relies on:
@   - the Y pointers are stepped by the source width, so the Y line stride
@     has to equal the source width
@   - the U pointer is only ever incremented by one per pixel pair and is
@     never adjusted at the end of a line
@   - each output word holds two pixels, the first one in the low halfword;
@     it is written with a 32-bit str, so results and the RGB stride are
@     presumably expected to be 4-byte aligned -- TODO confirm with callers
@
@ Both loop counters are decremented by 2 and tested for equality with
@ zero, so the source width and height must be positive and even. The
@ routine returns without writing anything when they are not.
@
@ NOTE(review): the three "add rX, pc, #(label-.-8)" instructions need the
@ distance to the tables to be encodable as an ARM immediate. Adding or
@ removing instructions between them and const_storage can make the file
@ fail to assemble.
@ ----------------------------------------------------------------------
convert_yuv420_rgb565:

        stmdb   sp!, { r4 - r12, lr }           @ save callee-saved regs and return address
        ldr     r7, [r0, #0]                    @ r7  = dest width
        ldr     r9, [r0, #4]                    @ r9  = dest height
        ldr     r10, [r0, #12]                  @ r10 = source width
        ldr     r5, [r0, #16]                   @ r5  = source height

        @ Reject dimensions the loops below cannot terminate on.
        cmp     r10, #0                         @ width <= 0 ?
        cmpgt   r5, #0                          @ otherwise: height <= 0 ?
        ble     yuv_exit                        @ nothing to convert
        orr     r12, r10, r5                    @ bit 0 set if either one is odd
        tst     r12, #1
        bne     yuv_exit                        @ counters step by 2: odd sizes never reach 0

        mul     r4, r5, r10                     @ r4 = source width * source height
        mov     r4, r4, lsr #2                  @ r4 = V ptr - U ptr
        ldr     r6, [r0, #20]                   @ r6 = RGB stride
        add     r8, r10, r2                     @ r8 = Ynext = Y + source width
        add     r0, r1, r6                      @ r0 = RGBnext = RGB + one output line
        stmdb   sp!, { r0-r10 }
        @ Stack description :
        @ (sp+ 0) RGB + one line (RGBnext)   r0
        @ (sp+ 4) RGB                        r1
        @ (sp+ 8) _py                        r2
        @ (sp+12) _pu                        r3
        @ (sp+16) _pv - _pu                  r4
        @ (sp+20) source height (counter)    r5
        @ (sp+24) RGB stride                 r6
        @ (sp+28) dest width (unused)        r7
        @ (sp+32) Ynext                      r8
        @ (sp+36) dest height (unused)       r9
        @ (sp+40) source width               r10

        mov     lr, r10                         @ lr = pixels left on this line pair
        add     r0, pc, #(const_storage-.-8)    @ r0 = base pointer to the constants array
        ldr     r8, [r0, #(4*4)]                @ r8 = multy, kept live for the whole routine
yuv_loop:
        add     r0, pc, #(const_storage-.-8)    @ r0 = base pointer to the constants array
        ldr     r10, [sp, #8]                   @ r10 = Y
        ldr     r1, [sp, #12]                   @ r1 = U
        ldrb    r9, [r10, #0]                   @ r9 = *Y
        ldrb    r11, [r1]                       @ r11 = *U
        add     r1, r1, #1                      @ r1 = U + 1
        ldr     r2, [sp, #16]                   @ r2 = V - U
        str     r1, [sp, #12]                   @ store the advanced U pointer
        add     r2, r2, r1                      @ r2 = V + 1
        ldrb    r12, [r2, #-1]                  @ r12 = *V
        sub     r11, r11, #128                  @ r11 = *U - 128
        sub     r12, r12, #128                  @ r12 = *V - 128
        ldr     r1, [r0, #(4*0)]                @ r1 = crv
        mov     r7, #32768                      @ r7 = 32768, the bias folded into each chroma term
        ldr     r2, [r0, #(4*3)]                @ r2 = -cgv
        mla     r6, r1, r12, r7                 @ r6 = nonyc_r = crv * (*V - 128) + 32768
        ldr     r3, [r0, #(4*1)]                @ r3 = cbu
        mla     r4, r2, r12, r7                 @ r4 = -cgv * (*V - 128) + 32768
        sub     r9, r9, #16                     @ r9 = *Y - 16
        mla     r5, r3, r11, r7                 @ r5 = nonyc_b = cbu * (*U - 128) + 32768
        ldr     r0, [r0, #(4*2)]                @ r0 = -cgu
        mla     r7, r8, r9, r6                  @ r7 = (*Y - 16) * multy + nonyc_r
        add     r10, r10, #2                    @ r10 = Y + 2
        mla     r4, r0, r11, r4                 @ r4 = nonyc_g = -cgu * (*U - 128) - cgv * (*V - 128) + 32768
        add     r0, pc, #(rb_clip-.-8)          @ r0 = pointer to the R and B clipping table
        mla     r12, r8, r9, r5                 @ r12 = (*Y - 16) * multy + nonyc_b
        ldrb    r7, [r0, r7, asr #(16+3)]       @ r7 = clipped 5-bit R
        mla     r1, r8, r9, r4                  @ r1 = (*Y - 16) * multy + nonyc_g
        ldrb    r9, [r10, #-1]                  @ r9 = *(Y+1)
        str     r10, [sp, #8]                   @ save Y + 2
        ldrb    r12, [r0, r12, asr #(16+3)]     @ r12 = clipped 5-bit B (start of the output word)
        add     r11, pc, #(g_clip-.-8)          @ r11 = pointer to the G clipping table
        ldrb    r1, [r11, r1, asr #(16+2)]      @ r1 = clipped 6-bit G
        sub     r9, r9, #16                     @ r9 = *(Y+1) - 16
        mla     r10, r8, r9, r6                 @ r10 = unclipped R of the second pixel
        add     r12, r12, r7, lsl #11           @ r12 = R.B of the first pixel (low halfword)
        mla     r7, r8, r9, r5                  @ r7 = unclipped B of the second pixel
        add     r12, r12, r1, lsl #5            @ r12 = RGB of the first pixel (low halfword done)
        mla     r2, r8, r9, r4                  @ r2 = unclipped G of the second pixel
        ldrb    r10, [r0, r10, asr #(16+3)]     @ r10 = clipped R
        ldrb    r7, [r0, r7, asr #(16+3)]       @ r7 = clipped B
        ldr     r1, [sp, #32]                   @ r1 = Ynext
        ldrb    r2, [r11, r2, asr #(16+2)]      @ r2 = clipped G
        ldrb    r9, [r1]                        @ r9 = *Ynext
        add     r12, r12, r2, lsl #(5+16)       @ r12 = RGB .G.
        sub     r9, r9, #16                     @ r9 = *Ynext - 16
        mla     r2, r8, r9, r4                  @ r2 = unclipped G of the first pixel on the next line
        add     r12, r12, r7, lsl #(0+16)       @ r12 = RGB .GB
        mla     r7, r8, r9, r5                  @ r7 = unclipped B of the first pixel on the next line
        add     r12, r12, r10, lsl #(11+16)     @ r12 = RGB RGB
        ldr     r3, [sp, #4]                    @ r3 = RGB
        mla     r10, r8, r9, r6                 @ r10 = unclipped R of the first pixel on the next line
        str     r12, [r3]                       @ store both pixels of the current line
        add     r3, r3, #4                      @ r3 = RGB + 4 (next pixel pair)
        str     r3, [sp, #4]                    @ store the RGB pointer
        ldrb    r9, [r1, #1]                    @ r9 = *(Ynext+1)
        add     r1, r1, #2                      @ r1 = Ynext + 2
        sub     r9, r9, #16                     @ r9 = *(Ynext+1) - 16
        ldrb    r12, [r0, r7, asr #(16+3)]      @ r12 = clipped B (..B in the low halfword)
        ldrb    r10, [r0, r10, asr #(16+3)]     @ r10 = clipped R
        mla     r7, r8, r9, r5                  @ r7 = unclipped B of the second pixel on the next line
        add     r12, r12, r10, lsl #11          @ r12 = R.B in the low halfword
        ldrb    r2, [r11, r2, asr #(16+2)]      @ r2 = clipped G
        mla     r10, r8, r9, r6                 @ r10 = unclipped R of the second pixel on the next line
        add     r12, r12, r2, lsl #5            @ r12 = RGB in the low halfword
        mla     r2, r8, r9, r4                  @ r2 = unclipped G of the second pixel on the next line
        ldrb    r7, [r0, r7, asr #(16+3)]       @ r7 = clipped B
        str     r1, [sp, #32]                   @ store the advanced Ynext pointer
        add     r12, r12, r7, lsl #(16+0)       @ r12 = RGB ..B
        ldrb    r10, [r0, r10, asr #(16+3)]     @ r10 = clipped R
        ldr     r3, [sp, #0]                    @ r3 = RGBnext pointer
        add     r12, r12, r10, lsl #(16+11)     @ r12 = RGB R.B
        ldrb    r2, [r11, r2, asr #(16+2)]      @ r2 = clipped G
        add     r3, r3, #4                      @ r3 = next pixel pair on the RGBnext line
        add     r12, r12, r2, lsl #(16+5)       @ r12 = RGB RGB
        str     r12, [r3, #-4]                  @ store both pixels of the next line
        str     r3, [sp, #0]                    @ store the advanced RGBnext pointer
        subs    lr, lr, #2                      @ two pixels of this line pair are done
        bne     yuv_loop                        @ restart if not at the end of the line


        @ End of a line pair: move every pointer down by two lines.
        ldr     r0, [sp, #40]                   @ r0 = saved source width


        ldr     r1, [sp, #0]                    @ r1 = RGBnext pointer (now at the end of its line)
        ldr     r2, [sp, #24]                   @ r2 = RGB stride
        mov     lr, r0                          @ lr = source width (restart the line counter)

        sub     r3, r2, r0, lsl #1              @ r3 = RGB stride - 2 * width = bytes left on the line
        add     r1, r1, r3                      @ r1 = start of the line after RGBnext
        str     r1, [sp, #4]                    @ that line is the RGB pointer of the next iteration
        add     r1, r1, r2                      @ r1 = one more line down
        str     r1, [sp, #0]                    @ store the updated RGBnext pointer

        ldr     r3, [sp, #40]                   @ r3 = source width
        ldr     r4, [sp, #8]                    @ r4 = Y ptr (now at the end of its line)
        ldr     r5, [sp, #32]                   @ r5 = Ynext ptr (now at the end of its line)
        add     r4, r4, r3                      @ r4 = Y ptr for the next two lines
        add     r5, r5, r3                      @ r5 = Ynext ptr for the next two lines
        str     r4, [sp, #8]                    @ store the updated Y pointer
        str     r5, [sp, #32]                   @ store the updated Ynext pointer


        ldr     r6, [sp, #20]                   @ r6 = lines left
        subs    r6, r6, #2                      @ two lines are done
        str     r6, [sp, #20]
        bne     yuv_loop

        @ Normal exit: drop the 11 words of loop state first.
        add     sp, sp, #(11*4)
yuv_exit:
        ldmia   sp!, { r4 - r12, pc }           @ restore callee-saved regs and return


const_storage:
@ Conversion coefficients, one word each, in the order the code indexes
@ them: crv, cbu, -cgu, -cgv, multy. Read as 16.16 fixed point they are
@ roughly 1.596, 2.017, -0.392, -0.813 and 1.164.
@ These tables are addressed pc-relative from convert_yuv420_rgb565, so
@ they have to stay in the same section, at the same distance from it.
        .word   0x00019895                      @ [0] crv
        .word   0x00020469                      @ [1] cbu
        .word   0xffff9bb5                      @ [2] -cgu
        .word   0xffff2fe1                      @ [3] -cgv
        .word   0x00012A15                      @ [4] multy

@ R/B clipping table. The code indexes rb_clip with a signed value, so
@ the 48 zero bytes in front of the label catch negative indices (clip
@ to 0) and the 48 bytes after the ramp catch indices above 31 (clip to
@ 0x1f).
rb_clip_dummy:
        .fill   48, 1, 0x00
rb_clip:
        .byte   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
        .byte   0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
        .byte   0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17
        .byte   0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
        .fill   48, 1, 0x1f

@ G clipping table, same layout with a 6-bit ramp: 96 zero bytes below
@ the label, then 0x00..0x3f, then 96 bytes of 0x3f.
g_clip_dummy:
        .fill   96, 1, 0x00
g_clip:
        .byte   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
        .byte   0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
        .byte   0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17
        .byte   0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
        .byte   0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27
        .byte   0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f
        .byte   0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37
        .byte   0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f
        .fill   96, 1, 0x3f

#endif

沒有留言: