Real Checko's Blog: Code Reading - 上一篇的YUV

最後的 table : rb_clip_dummy, rb_clip, 還有 g_clip_dummy, g_clip。
dummy 都是0，然後兩個加起來， rb 是 16 x 8, g 是16 x 16。
這個大概是跟 paper 講的一樣，用 table 取代 saturation 判斷。
0 的部份就是負值的部份。

所以整個 assembly code 中沒有 compare jmp 指令 (除了最後的 line end 判斷)。
YUV 計算部份直接用 mla ( x +)，沒有使用 table -- 大概是因為 ARM 作 16x16 只要1 個 clock，所以沒必要。

一樣是作 w (寬度)，然後 line.. 一次(loop) 4 pixel。

因為先作 bit shift (5-6-5)再做查表，所以table[]不用太大。

code 沒有避免使用 mul (mla)，反而大量使用，避免不需要的 ldr 動作 (大概是 ldr 和 mla 都是一個 clk 吧)。其中：

r8 : multy 0x00012A15
r9 : (Y-16)
r6 : Coef *(V-128) + 32768
r5 : Coef*(U-128) + 32768
r4 : -Coef *(U-128) - Coef*(V-128) + 32768

剩下的就..

原來 eVC 的 ARMASM 最佳 example code 就是 OS bsp 下所有的 .s file。
所以語法參考 BSP 就可以了。
MSDN : CE .NET 4.2 ASM

大概要改的是：

MS armasm 規定只有 label 可以從一行的第一格開始，所以所有其他的 instruction, directive 前面都要先空一格 (至少一個 space 或 tab)
comment 是以 ; 開頭
label 不可以加 : 號
.byte 改為 DCB
.word 改為 DCD
.text 宣告要改為 AREA |.text|,CODE, ARM
.global 要改 EXPORT

reference
http://checko.blogspot.com/2006/09/writting-arm-assembly-in-embedded-vc_28.html ：

手動加入 yuv420_rgb565.s
project setting - yuv420_rgb565.s - custom build : 填入 armasm ... (Debug, Release 都要加)

引用的 .cpp 加入：

 // The linkage specifier must be upper-case "C"; a lower-case "c" is rejected by the compiler.
 extern "C" void convert_yuv420_rgb565(char *,char*,int,int);

Q_Q ,, argument passing type 不一樣...

這個 assembly code 有"適當安排過". 可以看到：
有 memory access 的 instruction，接著的會是 register-only 的 operation。
這樣instruction pipeline 就可以沒有阻礙的run 下去 (如果一個 load 下一個 store, pipe line 要等 cache/memory 的同步?)
作法是：要使用之前，在 n 個 instruction 前就 load...

所以下面是重新安排後的 code，變得比較容易看....

;
;     void convert_yuv420_rgb565(AVPicture *picture, unsigned char *results, int w, int h) ;
;
;  Converts a planar YUV 4:2:0 picture to packed RGB565.
;  Each loop iteration handles one 2x2 pixel quad: two pixels on the current
;  line and two on the line below, all sharing one U and one V sample.
;
;  In:   r0 = picture. Six words are read from it:
;               +0  Y ptr       +4  U ptr       +8  V ptr
;               +12 stride_Y    +16 stride_U    +20 stride_V
;        r1 = results : RGB565 output. Lines are packed at (w * 2) bytes and
;                       written with 32-bit stores (presumably needs 4-byte
;                       alignment - confirm against the caller).
;        r2 = w       : width in pixels
;        r3 = h       : height in pixels
;
;  Both loop counters are decremented by 2 and tested with 'bne', so w and h
;  must be even and non-zero or the loops do not terminate at the intended point.
;
;  V is addressed as U + (V ptr - U ptr) and therefore advances with stride_U;
;  stride_V is computed and pushed but never read back.
;
;  r4-r12 and lr are saved on entry and restored on exit.
;  Nothing is returned.
;

   AREA |.text|,CODE,ARM

   EXPORT convert_yuv420_rgb565

convert_yuv420_rgb565
   stmdb   sp!, { r4 - r12, lr }   ; all callee saved regs
   ldr r7,  [r0,  #0]       ; Y ptr
   ldr r9,  [r0,  #4]       ; U ptr
   ldr r10, [r0,  #8]       ; V ptr
   sub r10, r10, r9         ; V ptr - U ptr
   ldr r4,  [r0, #12]       ; Stride_Y (one load serves both uses below)
   add r8, r7, r4           ; Y + stride_Y = pointer to the second Y line
   mov r4, r4, lsl #1
   sub r4, r4, r2           ; (2 * Stride_Y) - width
   ldr r5,  [r0, #16]       ; Stride_U
   sub r5, r5, r2, lsr #1   ; Stride_U - (width / 2)
   ldr r6,  [r0, #20]       ; Stride_V
   sub r6, r6, r2, lsr #1   ; Stride_V - (width / 2)
   add r0, r1, r2, lsl #1   ; RGB + one line (width * 2 bytes)
   stmdb   sp!, { r0-r10 }
   ; Stack description :
   ; (sp+ 0) RGB + one line
   ; (sp+ 4) RGB
   ; (sp+ 8) width (save)
   ; (sp+12) height
   ; (sp+16) (2 * stride_Y) - width
   ; (sp+20) stride_U - (width / 2)
   ; (sp+24) stride_V - (width / 2) !!! UNUSED !!!
   ; (sp+28) Y ptr
   ; (sp+32) Y ptr + one line
   ; (sp+36) U ptr
   ; (sp+40) V - U
   mov lr, r2                         ; Initialize the width counter
   add r0, pc, #(const_storage-.-8)   ; r0 = base pointer to the constants array
   ldr r8, [r0, #(4*4)]               ; r8 = multy (stays live for the whole function)
yuv_loop

   ldr r10, [sp, #28]                 ; r10 = Y
   ldrb r9, [r10, #0]                 ; r9 = *Y

   add r10, r10, #2                   ; r10 = Y + 2
   str r10, [sp, #28]                 ; save Y + 2

   ldr r1, [sp, #36]                  ; r1 = U
   ldrb r11, [r1]                     ; r11 = *U
   add r1, r1, #1                     ; r1 = U + 1
   str r1, [sp, #36]                  ; store the advanced U pointer

   ldr r2, [sp, #40]                  ; r2 = V - U
   add r2, r1, r2                     ; r2 = V + 1 (r1 was already advanced)
   ldrb r12, [r2, #-1]                ; r12 = *V

   sub r11, r11, #128                 ; r11 = *U - 128
   sub r12, r12, #128                 ; r12 = *V - 128

   mov r7, #32768                     ; r7 = 32768 (rounding term added by the MLAs)

   add r0, pc, #(const_storage-.-8)   ; r0 = base pointer to the constants array
   ldr r1, [r0, #(4*0)]               ; r1 = crv
   mla r6, r1, r12, r7                ; r6 = nonyc_r = crv * (*V - 128) + 32768

   ldr r2, [r0, #(4*3)]               ; r2 = -cgv
   mla r4, r2, r12, r7                ; r4 = - cgv * (*V - 128) + 32768

   ldr r3, [r0, #(4*1)]               ; r3 = cbu
   mla r5, r3, r11, r7                ; r5 = nonyc_b = cbu * (*U - 128) + 32768

   sub r9, r9, #16                    ; r9 = *Y - 16
   mla r7, r8, r9, r6                 ; r7 = (*Y - 16) * multy + nonyc_r

   ldr r0, [r0, #(4*2)]               ; r0 = -cgu
   mla r4, r0, r11, r4                ; r4 = nonyc_g = - cgu * (*U - 128) - cgv * (*V - 128) + 32768

   add r0, pc, #(rb_clip-.-8)         ; r0 = pointer to the R and B clipping array
   ldrb r7, [r0, r7, asr #(16+3)]     ; r7 = R component (5 bits, clipped by table)

   mla r12, r8, r9, r5                ; r12 = (*Y - 16) * multy + nonyc_b
   mla r1, r8, r9, r4                 ; r1 = (*Y - 16) * multy + nonyc_g

   ldrb r12, [r0, r12, asr #(16+3)]   ; r12 = B component (and the start of the RGB word)
   add r12, r12, r7, lsl #11          ; r12 = R.B ...

   add r11, pc, #(g_clip-.-8)         ; r11 = pointer to the G clipping array
   ldrb r1, [r11, r1, asr #(16+2)]    ; r1 = G component (6 bits, clipped by table)
   add r12, r12, r1, lsl #5           ; r12 = RGB ... (ie the first pixel (half-word) is done)

   ; --- next pixel
   ldrb r9, [r10, #-1]                ; r9 = *(Y+1)   (r10 still holds Y + 2)
   sub r9, r9, #16                    ; r9 = *(Y+1) - 16

   mla r10, r8, r9, r6                ; r10 is the Red part of the RGB triplet
   mla r7, r8, r9, r5                 ; r7 is the Blue part of the RGB triplet
   mla r2, r8, r9, r4                 ; r2 is the Green part of the RGB triplet

   ldrb r2, [r11, r2, asr #(16+2)]    ; r2 = G component
   add r12, r12, r2, lsl #(5+16)      ; r12 = RGB .G.
   ldrb r7, [r0, r7, asr #(16+3)]     ; r7 = B component
   add r12, r12, r7, lsl #(0+16)      ; r12 = RGB .GB
   ldrb r10, [r0, r10, asr #(16+3)]   ; r10 = R component
   add r12, r12, r10, lsl #(11+16)    ; r12 = RGB RGB

   ;---- do store ----
   ; The pixel pair belongs at the pointer value read from the stack, i.e.
   ; before the increment, hence the #-4 (same pattern as the second-line store).
   ldr r3, [sp, #4]                   ; r3 = RGB
   add r3, r3, #4                     ; r3 = RGB + 4 (ie next double-pixel)
   str r12, [r3, #-4]                 ; store the two pixels at the original *RGB
   str r3, [sp, #4]                   ; store the advanced RGB pointer

   ;---- next line ----
   ldr r1, [sp, #32]                  ; r1 = Ynext
   ldrb r9, [r1]                      ; r9 = *Ynext
   sub r9, r9, #16                    ; r9 = *Ynext - 16

   mla r2, r8, r9, r4                 ; r2 is the Green part of the RGB triplet
   mla r7, r8, r9, r5                 ; r7 is the Blue part of the RGB triplet
   mla r10, r8, r9, r6                ; r10 is the Red part of the RGB triplet

   ldrb r12, [r0, r7, asr #(16+3)]    ; r12 = ..B ...
   ldrb r10, [r0, r10, asr #(16+3)]   ; r10 = R component
   add r12, r12, r10, lsl #11         ; r12 = R.B ...
   ldrb r2, [r11, r2, asr #(16+2)]    ; r2 = G component
   add r12, r12, r2, lsl #5           ; r12 = RGB ...

   ;---- next pixel
   ldrb r9, [r1, #1]                  ; r9 = *(Ynext+1)
   sub r9, r9, #16                    ; r9 = *(Ynext+1) - 16

   add r1, r1, #2                     ; r1 = Ynext + 2
   str r1, [sp, #32]                  ; store the increased Ynext pointer

   mla r7, r8, r9, r5                 ; r7 is the Blue part of the RGB triplet
   mla r10, r8, r9, r6                ; r10 is the Red part of the RGB triplet
   mla r2, r8, r9, r4                 ; r2 is the Green part of the RGB triplet

   ldrb r7, [r0, r7, asr #(16+3)]     ; r7 = B component
   add r12, r12, r7, lsl #(16+0)      ; r12 = RGB ..B
   ldrb r10, [r0, r10, asr #(16+3)]   ; r10 = R component
   add r12, r12, r10, lsl #(16+11)    ; r12 = RGB R.B
   ldrb r2, [r11, r2, asr #(16+2)]    ; r2 = G component
   add r12, r12, r2, lsl #(16+5)      ; r12 = RGB RGB

   ;---- do store
   ldr r3, [sp, #0]                   ; r3 = RGBnext pointer
   add r3, r3, #4                     ; r3 = next double-pixel on the RGBnext line
   str r12, [r3, #-4]                 ; store the two pixels at the original *RGBnext
   str r3, [sp, #0]                   ; store the increased 'next line' pixel pointer

   ;-- complete, do loop --
   subs lr, lr, #2                    ; two pixels of width consumed
   bne yuv_loop                       ; and restart if not at the end of the line

   ; ---- end of a line pair: rewind the counter, step every pointer ----
   ldr r0, [sp, #8]                   ; r0 = saved width
   ldr r1, [sp, #0]                   ; r1 = RGBnext pointer
   mov lr, r0                         ; lr = saved width (to restart the width counter)
   str r1, [sp, #4]                   ; current RGBnext pointer is next iteration RGB pointer
   add r1, r1, r0, lsl #1             ; r1 = update RGBnext to next line
   str r1, [sp, #0]                   ; store updated RGBnext pointer

   ldr r3, [sp, #16]                  ; r3 = (2 * stride_Y) - width
   ldr r4, [sp, #28]                  ; r4 = Y ptr
   ldr r5, [sp, #32]                  ; r5 = Ynext ptr
   add r4, r4, r3                     ; r4 = Y ptr for the next two lines
   add r5, r5, r3                     ; r5 = Ynext ptr for the next two lines
   str r4, [sp, #28]                  ; store updated Y pointer
   str r5, [sp, #32]                  ; store update Ynext pointer

   ldr r1, [sp, #20]                  ; r1 = stride_U - (width / 2)
   ldr r2, [sp, #36]                  ; r2 = U ptr

   ldr r6, [sp, #12]                  ; get height counter

   add r2, r2, r1                     ; update U ptr
   str r2, [sp, #36]                  ; store updated U ptr (V follows, it is derived from U)

   subs r6, r6, #2                    ; two lines of height consumed
   str r6, [sp, #12]
   bne yuv_loop

   ; Exit cleanly :-)
   add sp, sp, #(11*4)             ; drop the 11 words pushed by 'stmdb sp!, { r0-r10 }'
   ldmia   sp!, { r4 - r12, pc }   ; restore callee saved regs and return


; Constant data for convert_yuv420_rgb565.
; The routine reaches these labels with pc-relative 'add rX, pc, #(label-.-8)',
; so they have to stay in the same AREA, directly after the code.
const_storage
   ; In order : crv, cbu, - cgu, - cgv, multy
   ; The routine shifts every product right by 16 (plus 3 or 2 more bits)
   ; before the table lookup, so these act as coefficients scaled by 2^16.
   DCD 0x00019895, 0x00020469, 0xffff9bb5, 0xffff2fe1, 0x00012A15

; R/B clip table, indexed from rb_clip with (value asr #(16+3)).
; Negative indices fall into rb_clip_dummy (48 zero bytes) and clamp to 0.
; Indices 0..31 map to themselves; the rest of the 80 entries saturate at 0x1f.
rb_clip_dummy
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
rb_clip
       DCB 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       DCB 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
       DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
       DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f
       DCB 0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f,0x1f

; G clip table, indexed from g_clip with (value asr #(16+2)).
; Negative indices fall into g_clip_dummy (96 zero bytes) and clamp to 0.
; Indices 0..63 map to themselves; the rest of the 160 entries saturate at 0x3f.
g_clip_dummy
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
       DCB 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
g_clip
       DCB 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
       DCB 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f
       DCB 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f
       DCB 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f
       DCB 0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f,0x3f

   ; Directives must not start in column 1; only labels may.
   END

Real Checko's Blog

2.17.2009

Code Reading - 上一篇的YUV - RGB assembly

沒有留言:

網誌存檔