// LINE_TIMEOUT_TEST
// One polling step shared by the WAIT_FOR_CSYNC_* loops.
// In:    r4  = address that is read as GPLEV0
//        r14 = cycle-counter value captured when the wait began
// Out:   r8  = the word just read (CSYNC_MASK bit inverted once the elapsed
//              cycle count is greater than LINE_TIMEOUT)
//        Z flag = set when the CSYNC_MASK bit of r8 is 0
// Clobb: r8, flags
.macro LINE_TIMEOUT_TEST
        READ_CYCLE_COUNTER r8
        subs   r8, r8, r14                // elapsed = now - start
        rsbmi  r8, r8, #0                 // negate a negative difference (absolute value)
        cmp    r8, #LINE_TIMEOUT
        // Read the GPLEV0
        ldr    r8, [r4]                   // ldr does not alter the flags set by cmp
        eorgt  r8, r8, #CSYNC_MASK        //inverting the value after the timeout will cause the test to pass
        tst    r8, #CSYNC_MASK
.endm

// LINE_TIMEOUT_TEST_SKIP_HSYNC
// Same polling step as LINE_TIMEOUT_TEST, but the CSYNC test is only made
// when BIT_NO_SKIP_HSYNC is set in r3. When that bit is clear the macro
// always finishes with Z set, so a following "bne" falls straight through.
// In:    r3  = flags word, r4 = address read as GPLEV0,
//        r14 = cycle-counter value captured when the wait began
// Out:   r8  = the word just read (CSYNC_MASK bit inverted after the timeout)
// Clobb: r8, flags
.macro LINE_TIMEOUT_TEST_SKIP_HSYNC
        READ_CYCLE_COUNTER r8
        subs   r8, r8, r14                // elapsed = now - start
        rsbmi  r8, r8, #0                 // negate a negative difference (absolute value)
        cmp    r8, #LINE_TIMEOUT
        // Read the GPLEV0
        ldr    r8, [r4]                   // ldr does not alter the flags set by cmp
        eorgt  r8, r8, #CSYNC_MASK        //inverting the value after the timeout will cause the test to pass
        tst    r3, #BIT_NO_SKIP_HSYNC     // Z set here means "do not wait for hsync"
        tstne  r8, #CSYNC_MASK            // only consulted when BIT_NO_SKIP_HSYNC is set
.endm

// WAIT_FOR_CSYNC_0
// Spin until three back-to-back LINE_TIMEOUT_TEST samples all report the
// CSYNC bit as 0; any sample that reports 1 restarts the run of three.
// The start timestamp is taken once, on entry, into r14.
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_0
        READ_CYCLE_COUNTER r14
waitlo\@:
        .rept  3                          // three consecutive low samples required
        LINE_TIMEOUT_TEST
        bne    waitlo\@
        .endr
.endm

// WAIT_FOR_CSYNC_0_LONG
// Longer-filter variant of WAIT_FOR_CSYNC_0: six back-to-back samples must
// all report the CSYNC bit as 0 before the macro falls through.
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_0_LONG
        READ_CYCLE_COUNTER r14
waitlo_long\@:
        .rept  6                          // six consecutive low samples required
        LINE_TIMEOUT_TEST
        bne    waitlo_long\@
        .endr
.endm

// WAIT_FOR_CSYNC_0_FAST_SKIP_HSYNC
// Unfiltered wait: leaves as soon as a single LINE_TIMEOUT_TEST_SKIP_HSYNC
// step finishes with Z set (CSYNC bit 0, or BIT_NO_SKIP_HSYNC clear in r3).
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_0_FAST_SKIP_HSYNC
        READ_CYCLE_COUNTER r14            // start-of-wait timestamp for the timeout
csync0_fast_poll\@:
        LINE_TIMEOUT_TEST_SKIP_HSYNC
        bne    csync0_fast_poll\@
.endm

// WAIT_FOR_CSYNC_0_SKIP_HSYNC
// Filtered wait built from LINE_TIMEOUT_TEST_SKIP_HSYNC: three consecutive
// steps must finish with Z set before the macro falls through.
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_0_SKIP_HSYNC
        READ_CYCLE_COUNTER r14
waitlo9\@:
        .rept  3                          // three consecutive passing samples required
        LINE_TIMEOUT_TEST_SKIP_HSYNC
        bne    waitlo9\@
        .endr
.endm

// WAIT_FOR_CSYNC_1_LONG
// Spin until six back-to-back LINE_TIMEOUT_TEST samples all report the
// CSYNC bit as 1; any sample that reports 0 restarts the run of six.
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_1_LONG
        READ_CYCLE_COUNTER r14
waithi_long\@:
        .rept  6                          // six consecutive high samples required
        LINE_TIMEOUT_TEST
        beq    waithi_long\@
        .endr
.endm

// WAIT_FOR_CSYNC_1_FAST
// Unfiltered wait: leaves on the first LINE_TIMEOUT_TEST sample that reports
// the CSYNC bit as 1.
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_1_FAST
        READ_CYCLE_COUNTER r14            // start-of-wait timestamp for the timeout
csync1_fast_poll\@:
        LINE_TIMEOUT_TEST
        beq    csync1_fast_poll\@
.endm

// WAIT_FOR_CSYNC_1
// Spin until three back-to-back LINE_TIMEOUT_TEST samples all report the
// CSYNC bit as 1; any sample that reports 0 restarts the run of three.
// Clobb: r8, r14, flags
.macro WAIT_FOR_CSYNC_1
        READ_CYCLE_COUNTER r14
waithi\@:
        .rept  3                          // three consecutive high samples required
        LINE_TIMEOUT_TEST
        beq    waithi\@
        .endr
.endm

// SWITCH_PSYNC_TO_VSYNC
// Calls set_vsync_psync with r0 = 0. r0-r12 and lr are saved on the stack
// around the call and restored afterwards, so no core register is changed
// as seen by the surrounding code.
.macro  SWITCH_PSYNC_TO_VSYNC
        stmfd  sp!, {r0-r12, lr}          // same encoding as push {r0-r12, lr}
        mov    r0, #0                     // argument for set_vsync_psync
        bl     set_vsync_psync
        ldmfd  sp!, {r0-r12, lr}          // same encoding as pop {r0-r12, lr}
.endm

// SWITCH_VSYNC_TO_PSYNC
// Calls set_vsync_psync with r0 = 1. r0-r12 and lr are saved on the stack
// around the call and restored afterwards, so no core register is changed
// as seen by the surrounding code.
.macro  SWITCH_VSYNC_TO_PSYNC
        stmfd  sp!, {r0-r12, lr}          // same encoding as push {r0-r12, lr}
        mov    r0, #1                     // argument for set_vsync_psync
        bl     set_vsync_psync
        ldmfd  sp!, {r0-r12, lr}          // same encoding as pop {r0-r12, lr}
.endm

#ifdef USE_ARM_CAPTURE

// WAIT_FOR_PSYNC_EDGE_FAST (ARM capture build)
// Polls [r4] until the PSYNC_MASK bit of the sample equals the PSYNC_MASK
// bit held in r3, then toggles that bit in r3 so the next call waits for
// the opposite level. No second confirming read is made.
// In:    r3 = flags word, r4 = address read as GPLEV0
// Out:   r8 = the matching sample (unmodified), r3 with PSYNC_MASK toggled
// Clobb: flags
.macro WAIT_FOR_PSYNC_EDGE_FAST
waitPF\@:
        // Read the GPLEV0
        ldr    r8, [r4]
        eor    r8, r3                     // PSYNC bit becomes 0 when sample matches r3
        tst    r8, #PSYNC_MASK
        bne    waitPF\@

        // toggle the polarity to look for the opposite edge next time
        eor    r8, r3    // restore r8 value
        eor    r3, #PSYNC_MASK
.endm

// Wait for the next edge on psync
//   if r3 bit 17 = 0 - wait for falling edge
//   if r3 bit 17 = 1 - wait for rising edge
// When BIT_OLD_FIRMWARE_SUPPORT is set in r3 the input is read a second time
// and must still match, otherwise polling starts again.
// In:    r3 = flags word, r4 = address read as GPLEV0
// Out:   r8 = the last sample read (unmodified), r3 with PSYNC_MASK toggled
// Clobb: flags
.macro WAIT_FOR_PSYNC_EDGE
wait\@:
        // Read the GPLEV0
        ldr    r8, [r4]
        eor    r8, r3                     // PSYNC bit becomes 0 when sample matches r3
        tst    r8, #PSYNC_MASK
        bne    wait\@

        // Read a second time to capture stable data
        // This is executed only if CPLD is V1 or V2
        tst    r3, #BIT_OLD_FIRMWARE_SUPPORT
        ldrne  r8, [r4]
        eorne  r8, r3
        tstne  r8, #PSYNC_MASK
        bne    wait\@                     // never taken when the old-firmware bit is clear (Z set by the first tst)

        // toggle the polarity to look for the opposite edge next time
        eor    r8, r3    // restore r8 value
        eor    r3, #PSYNC_MASK
.endm

// SKIP_PSYNC_COMMON_NO_OLD_CPLD (ARM capture build)
// Waits for the start of hsync and, unless BIT_HSYNC_EDGE is set in r3, for
// its end; measures the pulse length and uses it to adjust the psync skip
// count in r7.
// In:    r3 = flags word, r4 = address read as GPLEV0,
//        r7 = configured psync skip count,
//        r9 = HSYNC_SCROLL_HI in bits 31..16, HSYNC_SCROLL_LO in bits 15..0
// Out:   r3 = PSYNC_MASK cleared; on the trailing-edge path
//             BIT_INHIBIT_MODE_DETECT is recomputed
//        r7 = possibly increased by 0..2 (only when BIT_NO_H_SCROLL is clear)
//        stack = exactly one extra word: the leading-edge timestamp when
//                BIT_HSYNC_EDGE is set, otherwise the trailing-edge timestamp
// Clobb: r8, r10, r14, flags; on the trailing-edge path also r6 and r9
//        (r9 is reduced to its low 16 bits)
.macro  SKIP_PSYNC_COMMON_NO_OLD_CPLD
        // only called if 6 bits/pixel in non-fast mode (old CPLDs v1 & v2 don't work at 6bpp so no need for test)
        WAIT_FOR_CSYNC_0_FAST_SKIP_HSYNC
        bic   r3, r3, #PSYNC_MASK    // wait for zero after CSYNC
        READ_CYCLE_COUNTER r10
        push  {r10}                 //save leading edge timestamp

        tst    r3, #BIT_HSYNC_EDGE    // if leading edge then don't wait for end of hsync (means scroll detection won't work)
        bne    do_skip_psync_no_old\@
        pop   {r10}                  // trailing-edge path: discard the stacked copy, r10 still holds the value
        mov   r6, r9, lsr #16        //HSYNC_SCROLL_HI
        bic   r9, r9, #0xff000000
        bic   r9, r9, #0x00ff0000    //HSYNC_SCROLL_LO
        // Wait for the end of hsync
        WAIT_FOR_CSYNC_1_FAST
        READ_CYCLE_COUNTER r14
        push  {r14}                 //save trailing edge timestamp
        // Calculate length of low hsync pulse (in ARM cycles = ns)
        subs   r10, r14, r10
        rsbmi  r10, r10, #0
        // Start with the configured horizontal offset
        // Implement half character horizontal scrolling:
        // - a "short"  hsync is 3.5us, leave h_offset as-is
        // - a "normal" hsync is 4.0us, increment h_offset by 1
        // - a "long"   hsync is 4.5us, increment h_offset by 2
        // So test against two thresholds inbetween these values
        bic    r3, #BIT_INHIBIT_MODE_DETECT
        // new CPLD code only (not called from CPLD v1 & v2)
        mov    r8, r7               // r8 = candidate skip count
        cmp    r10, r6              //HSYNC_SCROLL_HI
        addlt  r8, r8, #1
        orrgt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        cmp    r10, r9              //HSYNC_SCROLL_LO
        addlt  r8, r8, #1
        orrlt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        tst    r3, #BIT_NO_H_SCROLL
        moveq  r7, r8                      // only allow fine sideways scrolling in bbc / electron mode (causes timing issues in ega mode)
        // Skip the configured number of psync edges (modes 0..6: edges every 250ns, mode 7: edges ever 333ns)
do_skip_psync_no_old\@:
.endm

// SKIP_PSYNC_NO_OLD_CPLD (ARM capture build)
// Runs SKIP_PSYNC_COMMON_NO_OLD_CPLD and then consumes r7 psync edges with
// WAIT_FOR_PSYNC_EDGE_FAST. The loop body runs before the count is tested,
// so at least one edge is always consumed. r7 is 0 on exit.
.macro  SKIP_PSYNC_NO_OLD_CPLD
        SKIP_PSYNC_COMMON_NO_OLD_CPLD
skip_edges_no_old\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // consume one psync edge
        subs   r7, r7, #1                  // one fewer edge left to skip
        bne    skip_edges_no_old\@
.endm

// SKIP_PSYNC_NO_OLD_CPLD_HIGH_LATENCY (ARM capture build)
// In this build the macro expands to exactly SKIP_PSYNC_NO_OLD_CPLD; the
// separate name exists so callers can use the same macro name in both the
// ARM and GPU capture builds.
.macro  SKIP_PSYNC_NO_OLD_CPLD_HIGH_LATENCY
        SKIP_PSYNC_NO_OLD_CPLD
.endm

// SKIP_PSYNC (ARM capture build)
// Waits for hsync, optionally measures its length to adjust the psync skip
// count, then consumes r7 psync edges with WAIT_FOR_PSYNC_EDGE.
// In:    r3 = flags word, r4 = address read as GPLEV0,
//        r7 = configured psync skip count,
//        r9 = HSYNC_SCROLL_HI in bits 31..16, HSYNC_SCROLL_LO in bits 15..0
// Out:   r7 = 0, r8 = last sample read by WAIT_FOR_PSYNC_EDGE,
//        stack = exactly one extra word (leading-edge timestamp when
//                BIT_HSYNC_EDGE is set, otherwise trailing-edge timestamp)
// Clobb: r8, r10, r14, flags, r3 (PSYNC_MASK, BIT_INHIBIT_MODE_DETECT);
//        on the trailing-edge path r9 is reduced to its low 16 bits
.macro  SKIP_PSYNC
        // called if 4 bits per pixel in non-fast mode so has support for old CPLD v1 & v2
        WAIT_FOR_CSYNC_0_SKIP_HSYNC
        bic    r3, r3, #PSYNC_MASK             // wait for zero after CSYNC
        READ_CYCLE_COUNTER r10
        push   {r10}                  // leading edge timestamp

        tst    r3, #BIT_HSYNC_EDGE    // if leading edge then don't wait for end of hsync (means scroll detection won't work)
        bne    do_skip_psync\@
        pop   {r10}                   // trailing-edge path: discard the stacked copy, r10 still holds the value
        // Wait for the end of hsync
        WAIT_FOR_CSYNC_1
        READ_CYCLE_COUNTER r14
        push  {r14}                   // trailing edge timestamp
        // Calculate length of low hsync pulse (in ARM cycles = ns)
        subs   r10, r14, r10
        rsbmi  r10, r10, #0

        // Start with the configured horizontal offset
        // Implement half character horizontal scrolling:
        // - a "short"  hsync is 3.5us, leave h_offset as-is
        // - a "normal" hsync is 4.0us, increment h_offset by 1
        // - a "long"   hsync is 4.5us, increment h_offset by 2
        // So test against two thresholds inbetween these values
        bic    r3, #BIT_INHIBIT_MODE_DETECT
        mov    r8, r7                 // r8 = candidate skip count
        tst    r3, #BIT_OLD_FIRMWARE_SUPPORT
        beq    notoldfirmwarescroll\@

        // old CPLD V1 & V2 code
        // (this path adds 1 per threshold on "gt"; the V3 path below adds on "lt")
        cmp    r10, r9, lsr #16     //HSYNC_SCROLL_HI
        addgt  r8, r8, #1
        orrgt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        bic    r9, r9, #0xff000000
        bic    r9, r9, #0x00ff0000
        cmp    r10, r9              //HSYNC_SCROLL_LO
        addgt  r8, r8, #1
        orrlt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        b      doneoldfirmwarescroll\@
notoldfirmwarescroll\@:
        // new CPLD V3 or later code
        cmp    r10, r9, lsr #16     //HSYNC_SCROLL_HI
        addlt  r8, r8, #1
        orrgt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        bic    r9, r9, #0xff000000
        bic    r9, r9, #0x00ff0000
        cmp    r10, r9              //HSYNC_SCROLL_LO
        addlt  r8, r8, #1
        orrlt  r3, r3, #BIT_INHIBIT_MODE_DETECT
doneoldfirmwarescroll\@:
        tst    r3, #BIT_NO_H_SCROLL
        moveq  r7, r8                     // only allow fine sideways scrolling in bbc / electron mode (causes timing issues in ega mode)
        // Skip the configured number of psync edges (modes 0..6: edges every 250ns, mode 7: edges ever 333ns)
do_skip_psync\@:
skip_psync_loop\@:
        WAIT_FOR_PSYNC_EDGE               // wait for next edge of psync
        subs   r7, r7, #1
        bne    skip_psync_loop\@
.endm

// SKIP_PSYNC_SIMPLE_FAST (ARM capture build)
// Adds param_delay to the skip count in r7, waits for sync according to
// param_sync_edge, then consumes r7 psync edges.
// param_sync_edge dispatch as coded below:
//   0 -> edge_trail_neg    1 -> edge_lead_neg
//   2 -> edge_trail_pos    3 -> edge_lead_pos
//   4 -> edge_trail_both   any other value -> falls through to the
//                          single-edge leading wait (wait_csync_lo_fast3)
// In:    r3 = flags word, r4 = address read as GPLEV0, r7 = skip count
// Out:   r7 = 0, r10 = timestamp taken when the leading sync condition was
//        met, also pushed on the stack (one extra word) at the end
// Clobb: r8, r10, flags, r3 (PSYNC_MASK)
// The literal loads (ldr =...) need a literal pool within range of each
// expansion.
.macro  SKIP_PSYNC_SIMPLE_FAST
        ldr    r8, =param_delay
        ldr    r8, [r8]
        add    r7, r7, r8
        ldr    r8, =param_sync_edge
        ldr    r8, [r8]
        orr    r3, r3, #PSYNC_MASK    // only -ve edge (inverted later)
        cmp    r8, #0
        beq    edge_trail_neg\@
        cmp    r8, #1
        beq    edge_lead_neg\@
        bic    r3, r3, #PSYNC_MASK    // only +ve edge (inverted later)
        cmp    r8, #2
        beq    edge_trail_pos\@
        cmp    r8, #3
        beq    edge_lead_pos\@
        cmp    r8, #4
        beq    edge_trail_both\@
//        cmp    r8, #5
//        beq    edge_lead_both\@
//edge_lead_both\@:
//        bic   r3, r3, #PSYNC_MASK    // wait for zero
wait_csync_lo_fast3\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // wait for next edge of psync
        tst    r3, #BIT_NO_SKIP_HSYNC
        tstne  r8, #CSYNC_MASK             // r8 = sample returned by WAIT_FOR_PSYNC_EDGE_FAST
        bne    wait_csync_lo_fast3\@
        READ_CYCLE_COUNTER r10        //timestamp is kept in r10 and pushed at the end of the macro

        b      skip_psync_loop_simple_fast\@

edge_trail_both\@:
//        bic   r3, r3, #PSYNC_MASK    // wait for zero
wait_csync_lo_fast4\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // wait for next edge of psync
        tst    r3, #BIT_NO_SKIP_HSYNC
        tstne  r8, #CSYNC_MASK
        bne    wait_csync_lo_fast4\@
        READ_CYCLE_COUNTER r10        //timestamp is kept in r10 and pushed at the end of the macro
        eor    r3, r3, #BIT_NO_SKIP_HSYNC  // temporarily invert so the tst/tsteq pair below can reuse the bit
wait_csync_hi_fast4\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // wait for next edge of psync
        tst    r3, #BIT_NO_SKIP_HSYNC
        tsteq  r8, #CSYNC_MASK
        beq    wait_csync_hi_fast4\@
        eor    r3, r3, #BIT_NO_SKIP_HSYNC  // restore the bit
        b      skip_psync_loop_simple_fast\@

edge_lead_neg\@:
edge_lead_pos\@:
        //incoming psync state controls edge
wait_csync_lo_fast\@:
        WAIT_FOR_PSYNC_EDGE_FAST
        WAIT_FOR_PSYNC_EDGE_FAST           // two edges per pass, so CSYNC is always tested after the same psync polarity
        tst    r3, #BIT_NO_SKIP_HSYNC
        tstne  r8, #CSYNC_MASK
        bne    wait_csync_lo_fast\@
        READ_CYCLE_COUNTER r10        //timestamp is kept in r10 and pushed at the end of the macro
        b      skip_psync_loop_simple_fast\@

edge_trail_neg\@:
edge_trail_pos\@:
        //incoming psync state controls edge *** this one used by amiga
wait_csync_lo_fast2\@:
        WAIT_FOR_PSYNC_EDGE_FAST
        WAIT_FOR_PSYNC_EDGE_FAST
        tst    r3, #BIT_NO_SKIP_HSYNC
        tstne  r8, #CSYNC_MASK
        bne    wait_csync_lo_fast2\@
        READ_CYCLE_COUNTER r10        //timestamp is kept in r10 and pushed at the end of the macro
        eor    r3, r3, #BIT_NO_SKIP_HSYNC  // temporarily invert so the tst/tsteq pair below can reuse the bit
wait_csync_hi_fast\@:
        WAIT_FOR_PSYNC_EDGE_FAST
        WAIT_FOR_PSYNC_EDGE_FAST
        tst    r3, #BIT_NO_SKIP_HSYNC
        tsteq  r8, #CSYNC_MASK
        beq    wait_csync_hi_fast\@
        eor    r3, r3, #BIT_NO_SKIP_HSYNC  // restore the bit
        // Skip the configured number of psync edges (modes 0..6: edges every 250ns, mode 7: edges ever 333ns)

skip_psync_loop_simple_fast\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // wait for next edge of psync
        subs   r7, r7, #1
        bne    skip_psync_loop_simple_fast\@
        push  {r10}                        // leave the timestamp on the stack for the caller
.endm

#else

//**********************GPU CAPTURE**********************

// WAIT_FOR_PSYNC_EDGE_FAST (GPU capture build)
// Returns the next captured sample in r8. Two words at GPU_workspace carry
// state between calls: word 0 = last value returned through this path,
// word 1 = byte offset (from r4) of the next register to poll.
//   - If bit 0 of word 0 is set, the upper half of that word (lsr #16) is
//     returned without polling.
//   - Otherwise [r4 + offset] is polled until its PSYNC_MASK bit differs
//     from r3, or bit 31 of (value eor r3) is set. The offset then advances
//     by 4, stepping over GPU_SYNC_offset - GPU_DATA_0_offset, and wraps to
//     0 at GPU_DATA_5_offset, at which point PSYNC_MASK in r3 is toggled.
// Finally bit 24 of r8 is replaced with a copy of bit 14 (0x4000).
// In:    r3 = flags word, r4 = base address for the polled registers
// Out:   r8 = sample, r3 possibly with PSYNC_MASK toggled
// Clobb: flags (r9 and r10 are saved and restored)
// Note:  .ltorg emits a literal pool inside the expansion; the preceding
//        unconditional branch jumps over it.
.macro WAIT_FOR_PSYNC_EDGE_FAST
        push   {r9-r10}
        ldr    r10, =GPU_workspace
        ldmia  r10, {r8-r9}         //r10 is now GPU_workspace
        tst    r8, #0x00000001      //first or second sample?
        movne  r8, r8, lsr #16
        beq    read_registers\@
        b      got_sample\@
        .ltorg
read_registers\@:
    //    tst    r3, #BIT_NO_SKIP_HSYNC
    //    beq    got_sample\@
wait\@:
        ldr    r8, [r4, r9]
        eor    r8, r3
        tst    r8, #PSYNC_MASK
        tsteq  r8, #0x80000000
        beq    wait\@
        eor    r8, r3               // undo the eor so r8 is the raw value again

        add    r9, r9, #4
        cmp    r9, #(GPU_SYNC_offset - GPU_DATA_0_offset)
        addeq  r9, r9, #4           // step over the sync register slot
        cmp    r9, #GPU_DATA_5_offset
        moveq  r9, #0               // wrap back to the first data register
        eoreq  r3, r3, #PSYNC_MASK

got_sample\@:
        stmia  r10, {r8,r9}         // save sample and next offset for the following call
        bic    r8, r8, #0x01000000
        tst    r8, #0x4000
        orrne  r8, r8, #0x01000000

        pop    {r9-r10}
.endm

// WAIT_FOR_PSYNC_EDGE (GPU capture build)
// In this build the macro expands to exactly WAIT_FOR_PSYNC_EDGE_FAST.
.macro WAIT_FOR_PSYNC_EDGE
        WAIT_FOR_PSYNC_EDGE_FAST
.endm

// SETUP_GPU_CAPTURE
// Calls _get_gpu_command_base_r10 (defined elsewhere; presumably leaves the
// command register address in r10 - confirm against its definition), spins
// until the word at [r10] reads 0, then writes 0 to every word from
// [r10 + GPU_DATA_5_offset] down to [r10] and to both words of GPU_workspace.
// Out:   r8 = 0, r10 = value left by _get_gpu_command_base_r10,
//        r14 = address of GPU_workspace
// Clobb: r8, r10, r14 (lr is overwritten by bl), flags, plus anything the
//        called routine changes
.macro  SETUP_GPU_CAPTURE
        bl     _get_gpu_command_base_r10
capturebusy\@:
        ldr    r8, [r10]
        cmp    r8, #0
        bne    capturebusy\@
        //zero out ram copy and GPU data registers (r8 is already zero)
        mov    r14, #GPU_DATA_5_offset
clear_regs\@:
        str    r8, [r10, r14]
        subs   r14, r14, #4
        bpl    clear_regs\@               // runs down to and including offset 0
        ldr    r14, =GPU_workspace
        str    r8, [r14]
        str    r8, [r14, #4]
.endm

// SETUP_GPU_CAPTURE_CPLD
// Runs SETUP_GPU_CAPTURE, then builds the capture command word and stores
// it at [r10].
//   command = r7 + r1
//             + 2                   when BIT_NO_H_SCROLL and BIT_HSYNC_EDGE are both clear
//             | LEADING_SYNC_FLAG   when BIT_HSYNC_EDGE is set
//             + the r8 value passed in
// In:    r1, r7 = sample counts, r3 = flags word, r8 = extra command flags
// Out:   r8 = command word written, r14 = the r8 value passed in
// Clobb: r8, r10, r14, flags (see SETUP_GPU_CAPTURE)
.macro  SETUP_GPU_CAPTURE_CPLD
        push   {r8}                        // keep caller flags across SETUP_GPU_CAPTURE, which zeroes r8
        SETUP_GPU_CAPTURE
        add    r8, r7, r1                  //now r8 is total samples to capture (offset + video)
        tst    r3, #BIT_NO_H_SCROLL        // only allow fine sideways scrolling in bbc / electron mode (causes timing issues in ega mode)
        addeq  r8, r8, #2                  // add 2 extra samples when hscrolling to allow for shift
        tst    r3, #BIT_HSYNC_EDGE         // if leading edge then don't wait for end of hsync (means scroll detection won't work)
        addne  r8, r7, r1                  //restore r8 if leading edge as no sideways scrolling allowed
        orrne  r8, #LEADING_SYNC_FLAG
        pop    {r14}                       // r14 = caller flags saved above
        add    r8, r8, r14                 // adds in extra flags such as high latency capture or additional psync counts used in NTSC artfact capture
        str    r8, [r10]  //command register
.endm

// SKIP_PSYNC_COMMON_NO_OLD_CPLD (GPU capture build)
// Issues the capture command, waits for hsync, optionally measures its
// length to adjust r7 and r1, then writes SYNC_ABORT_FLAG to the command
// register.
// In:    r1 = sample count, r3 = flags word, r4 = address read as GPLEV0,
//        r7 = configured psync skip count, r8 = extra command flags,
//        r9 = HSYNC_SCROLL_HI in bits 31..16, HSYNC_SCROLL_LO in bits 15..0
// Out:   r4 = value left by _get_gpu_data_base_r4 (defined elsewhere),
//        r7 / r1 adjusted so that together they grow by 2 when
//        BIT_NO_H_SCROLL is clear on the trailing-edge path,
//        stack = exactly one extra word (leading- or trailing-edge timestamp)
// Clobb: r8, r10, r14, flags, r3 (PSYNC_MASK, BIT_INHIBIT_MODE_DETECT);
//        on the trailing-edge path also r6 and r9
.macro  SKIP_PSYNC_COMMON_NO_OLD_CPLD
        //enters with R8 containing extra gpu flags such as high latency or additional psync counts used in NTSC artfact capture
        SETUP_GPU_CAPTURE_CPLD
        WAIT_FOR_CSYNC_0_FAST_SKIP_HSYNC
        READ_CYCLE_COUNTER r10
        bic   r3, r3, #PSYNC_MASK    // wait for zero after CSYNC
        push  {r10}                  // leading edge timestamp
        tst   r3, #BIT_HSYNC_EDGE    // if leading edge then don't wait for end of hsync (means scroll detection won't work)
        bne   do_skip_psync_no_old1\@

        pop   {r10}                  // trailing-edge path: discard the stacked copy, r10 still holds the value
        mov   r6, r9, lsr #16        //HSYNC_SCROLL_HI
        bic   r9, r9, #0xff000000
        bic   r9, r9, #0x00ff0000    //HSYNC_SCROLL_LO
        // Wait for the end of hsync
        WAIT_FOR_CSYNC_1_FAST
        READ_CYCLE_COUNTER r14
        push   {r14}     //save timestamp
        // Calculate length of low hsync pulse (in ARM cycles = ns)
        subs   r10, r14, r10
        rsbmi  r10, r10, #0
        // Start with the configured horizontal offset
        // Implement half character horizontal scrolling:
        // - a "short"  hsync is 3.5us, leave h_offset as-is
        // - a "normal" hsync is 4.0us, increment h_offset by 1
        // - a "long"   hsync is 4.5us, increment h_offset by 2
        // So test against two thresholds inbetween these values
        bic    r3, #BIT_INHIBIT_MODE_DETECT
        // new CPLD code only (not called from CPLD v1 & v2)
        mov    r8, r7               // r8 = candidate skip count
        cmp    r10, r6              //HSYNC_SCROLL_HI
        addlt  r8, r8, #1
        orrgt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        cmp    r10, r9              //HSYNC_SCROLL_LO
        addlt  r8, r8, #1
        orrlt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        tst    r3, #BIT_NO_H_SCROLL
        subeq  r10, r8, r7                // r10 = amount added to the skip count (0..2)
        rsbeq  r10, r10, #2               // r10 = 2 - that amount
        addeq  r1, r1, r10                // increase r1 if no adjustment to r7
        moveq  r7, r8                     // only allow fine sideways scrolling in bbc / electron mode (causes timing issues in ega mode)
        // Skip the configured number of psync edges (modes 0..6: edges every 250ns, mode 7: edges ever 333ns)
do_skip_psync_no_old1\@:
        bl     _get_gpu_data_base_r4
        mov    r8, #SYNC_ABORT_FLAG
        str    r8, [r4, #(GPU_COMMAND_offset - GPU_DATA_0_offset)]  //command register
.endm

// SKIP_PSYNC_NO_OLD_CPLD_HIGH_LATENCY (GPU capture build)
// Passes HIGH_LATENCY_FLAG to SKIP_PSYNC_COMMON_NO_OLD_CPLD in r8 when
// BIT_RPI234 is set in r3 (otherwise passes 0), then consumes r7 psync edges.
// r7 is 0 on exit.
.macro  SKIP_PSYNC_NO_OLD_CPLD_HIGH_LATENCY
        tst    r3, #BIT_RPI234
        movne  r8, #HIGH_LATENCY_FLAG      //request high latency capture (slightly faster but only really suitable for 9/12bpp modes)
        moveq  r8, #0                      // no extra command flags
        SKIP_PSYNC_COMMON_NO_OLD_CPLD
skip_edges_high_latency\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // consume one psync edge
        subs   r7, r7, #1
        bne    skip_edges_high_latency\@
.endm

// SKIP_PSYNC_NO_OLD_CPLD (GPU capture build)
// Runs SKIP_PSYNC_COMMON_NO_OLD_CPLD with no extra command flags (r8 = 0),
// then consumes r7 psync edges. r7 is 0 on exit.
.macro  SKIP_PSYNC_NO_OLD_CPLD
        mov    r8, #0                      // no extra command flags
        SKIP_PSYNC_COMMON_NO_OLD_CPLD
skip_edges_gpu_no_old\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // consume one psync edge
        subs   r7, r7, #1
        bne    skip_edges_gpu_no_old\@
.endm

// SKIP_PSYNC (GPU capture build)
// Issues the capture command (with OLD_FIRMWARE_FLAG when
// BIT_OLD_FIRMWARE_SUPPORT is set), waits for hsync, optionally measures
// its length to adjust r7 and r1, writes SYNC_ABORT_FLAG to the command
// register and finally consumes r7 psync edges.
// In:    r1 = sample count, r3 = flags word, r4 = address read as GPLEV0,
//        r7 = configured psync skip count,
//        r9 = HSYNC_SCROLL_HI in bits 31..16, HSYNC_SCROLL_LO in bits 15..0
// Out:   r7 = 0, r4 = value left by _get_gpu_data_base_r4,
//        stack = exactly one extra word (leading- or trailing-edge timestamp)
// Clobb: r8, r10, r14, flags, r3 (PSYNC_MASK, BIT_INHIBIT_MODE_DETECT);
//        on the trailing-edge path r9 is reduced to its low 16 bits
.macro  SKIP_PSYNC
        mov    r8, #0
        tst    r3, #BIT_OLD_FIRMWARE_SUPPORT
        orrne  r8, r8, #OLD_FIRMWARE_FLAG    //request old firmware support  (does double reads so slower but only used on 3bpp)
        SETUP_GPU_CAPTURE_CPLD
        WAIT_FOR_CSYNC_0_SKIP_HSYNC
        READ_CYCLE_COUNTER r10
        bic   r3, r3, #PSYNC_MASK    // wait for zero after CSYNC
        push  {r10}                  // leading edge timestamp
        tst   r3, #BIT_HSYNC_EDGE    // if leading edge then don't wait for end of hsync (means scroll detection won't work)
        bne   do_skip_psync3\@

        pop   {r10}                  // trailing-edge path: discard the stacked copy, r10 still holds the value
        // Wait for the end of hsync
        WAIT_FOR_CSYNC_1
        READ_CYCLE_COUNTER r14
        push   {r14}     //save timestamp

        // Calculate length of low hsync pulse (in ARM cycles = ns)
        subs   r10, r14, r10
        rsbmi  r10, r10, #0

        // Start with the configured horizontal offset
        // Implement half character horizontal scrolling:
        // - a "short"  hsync is 3.5us, leave h_offset as-is
        // - a "normal" hsync is 4.0us, increment h_offset by 1
        // - a "long"   hsync is 4.5us, increment h_offset by 2
        // So test against two thresholds inbetween these values
        bic    r3, #BIT_INHIBIT_MODE_DETECT
        mov    r8, r7               // r8 = candidate skip count
        tst    r3, #BIT_OLD_FIRMWARE_SUPPORT
        beq    notoldfirmwarescroll\@

        // old CPLD V1 & V2 code
        // (this path adds 1 per threshold on "gt"; the V3 path below adds on "lt")
        cmp    r10, r9, lsr #16     //HSYNC_SCROLL_HI
        addgt  r8, r8, #1
        orrgt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        bic    r9, r9, #0xff000000
        bic    r9, r9, #0x00ff0000
        cmp    r10, r9              //HSYNC_SCROLL_LO
        addgt  r8, r8, #1
        orrlt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        b      doneoldfirmwarescroll\@
notoldfirmwarescroll\@:
        // new CPLD V3 or later code
        cmp    r10, r9, lsr #16     //HSYNC_SCROLL_HI
        addlt  r8, r8, #1
        orrgt  r3, r3, #BIT_INHIBIT_MODE_DETECT
        bic    r9, r9, #0xff000000
        bic    r9, r9, #0x00ff0000
        cmp    r10, r9              //HSYNC_SCROLL_LO
        addlt  r8, r8, #1
        orrlt  r3, r3, #BIT_INHIBIT_MODE_DETECT
doneoldfirmwarescroll\@:
        tst    r3, #BIT_NO_H_SCROLL
        subeq  r10, r8, r7                // r10 = amount added to the skip count (0..2)
        rsbeq  r10, r10, #2               // r10 = 2 - that amount
        addeq  r1, r1, r10                // increase r1 if no adjustment to r7
        moveq  r7, r8                     // only allow fine sideways scrolling in bbc / electron mode (causes timing issues in ega mode)

        // Skip the configured number of psync edges (modes 0..6: edges every 250ns, mode 7: edges ever 333ns)
do_skip_psync3\@:
        bl     _get_gpu_data_base_r4
        mov    r8, #SYNC_ABORT_FLAG
        str    r8, [r4, #(GPU_COMMAND_offset - GPU_DATA_0_offset)]  //command register
skip_psync_loop\@:
        WAIT_FOR_PSYNC_EDGE               // wait for next edge of psync
        subs   r7, r7, #1
        bne    skip_psync_loop\@

.endm

// SKIP_PSYNC_SIMPLE_FAST (GPU capture build)
// Adds param_delay to r7, writes a capture command of (r7 + r1) plus flags
// to the command register, optionally waits for the sync register to
// signal, records a timestamp on the stack and then consumes r7 psync edges.
// Command flags: HIGH_LATENCY_FLAG when BIT_RPI234 is set; when
// BIT_NO_SKIP_HSYNC is set, SIMPLE_SYNC_FLAG and param_sync_edge << 16.
// In:    r1 = sample count, r3 = flags word, r7 = skip count
// Out:   r7 = 0, r4 = value left by _get_gpu_data_base_r4,
//        stack = one extra word (the timestamp)
// Clobb: r8, r9, r10, r14, flags, r3 (PSYNC_MASK)
.macro  SKIP_PSYNC_SIMPLE_FAST
        SETUP_GPU_CAPTURE
        ldr    r8, =param_delay
        ldr    r8, [r8]
        add    r7, r7, r8
        bl     _get_gpu_data_base_r4
        add    r8, r7, r1
        tst    r3, #BIT_RPI234
        orrne  r8, r8, #HIGH_LATENCY_FLAG       //request high latency capture (slightly faster but only really suitable for 9/12bpp modes)
        ldr    r9, =param_sync_edge
        ldr    r9, [r9]
        tst    r3, #BIT_NO_SKIP_HSYNC
        orrne  r8, r8, #SIMPLE_SYNC_FLAG        //flag sync command
        orrne  r8, r9, lsl #16                  //or in sync command
        str    r8, [r4, #(GPU_COMMAND_offset - GPU_DATA_0_offset)]  //command register
        beq    skip_psync_simple_fast\@         // flags still come from the tst on BIT_NO_SKIP_HSYNC
wait_for_simple_sync\@:
        ldr    r8, [r4, #(GPU_SYNC_offset - GPU_DATA_0_offset)]     //sync register
        tst    r8, #1
        beq    wait_for_simple_sync\@
skip_psync_simple_fast\@:
        READ_CYCLE_COUNTER r10
        push  {r10}                  // timestamp left on the stack for the caller
        bic   r3, r3, #PSYNC_MASK    // wait for zero after CSYNC
skip_psync_loop_simple_fast_loop\@:
        WAIT_FOR_PSYNC_EDGE_FAST           // wait for next edge of psync
        subs   r7, r7, #1
        bne    skip_psync_loop_simple_fast_loop\@
.endm

//**********************GPU CAPTURE END**********************

#endif

// CAPTURE_LOW_BITS
// Packs four 3-bit pixels from the sample in r8 into bits 15..0 of r10.
// Every shift is written as (destination bit - source bit).
// In:    r8  = GPIO sample
// Out:   r10 = packed pixels 0..3 (all other bits zero)
// Clobb: r8, r9
.macro CAPTURE_LOW_BITS
        // Pixel 0 in GPIO  4.. 2 ->  7.. 4
        // Pixel 1 in GPIO  7.. 5 ->  3.. 0
        // Pixel 2 in GPIO 10.. 8 -> 15..12
        // Pixel 3 in GPIO 13..11 -> 11.. 8

        and    r10, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r9, r8, #(0x7 << (PIXEL_BASE + 3))
        mov    r10, r10, lsl #(4 - (PIXEL_BASE + 0))
        orr    r10, r10, r9, lsr #((PIXEL_BASE + 3) - 0)

        and    r9, r8, #(0x7 << (PIXEL_BASE + 6))
        and    r8, r8, #(0x7 << (PIXEL_BASE + 9))
        orr    r10, r10, r9, lsl #(12 - (PIXEL_BASE + 6))
        orr    r10, r10, r8, lsr #((PIXEL_BASE + 9) - 8)
.endm

// CAPTURE_HIGH_BITS
// ORs four more 3-bit pixels from the sample in r8 into bits 31..16 of r10.
// Every shift is written as (destination bit - source bit).
// In:    r8  = GPIO sample, r10 = word being assembled
// Out:   r10 = input r10 with pixels 4..7 ORed in
// Clobb: r8, r9, r14
.macro CAPTURE_HIGH_BITS
        // Pixel 4 in GPIO  4.. 2 -> 23..20
        // Pixel 5 in GPIO  7.. 5 -> 19..16
        // Pixel 6 in GPIO 10.. 8 -> 31..28
        // Pixel 7 in GPIO 13..11 -> 27..24

        and    r9, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 3))
        orr    r10, r10, r9, lsl #(20 - (PIXEL_BASE + 0))
        orr    r10, r10, r14, lsl #(16 - (PIXEL_BASE + 3))

        and    r9, r8, #(0x7 << (PIXEL_BASE + 6))
        and    r8, r8, #(0x7 << (PIXEL_BASE + 9))
        orr    r10, r10, r9, lsl #(28 - (PIXEL_BASE + 6))
        orr    r10, r10, r8, lsl #(24 - (PIXEL_BASE + 9))
.endm

// CAPTURE_LOW_BITS_NORMAL reg
// XORs four 3-bit pixels from the sample in r8 into the value of reg and
// leaves the result in r10. r8 and reg are not modified.
// Every shift is written as (destination bit - source bit).
// Clobb: r9, r10, r14
.macro CAPTURE_LOW_BITS_NORMAL reg
        // Pixel 0 in GPIO  4.. 2 ->  7.. 4
        // Pixel 1 in GPIO  7.. 5 ->  3.. 0
        // Pixel 2 in GPIO 10.. 8 -> 15..12
        // Pixel 3 in GPIO 13..11 -> 11.. 8

        and    r9, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 3))
        eor    r10, \reg, r9, lsl #(4 - (PIXEL_BASE + 0))
        eor    r10, r10, r14, lsr #((PIXEL_BASE + 3) - 0)

        and    r9, r8, #(0x7 << (PIXEL_BASE + 6))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 9))
        eor    r10, r10, r9, lsl #(12 - (PIXEL_BASE + 6))
        eor    r10, r10, r14, lsr #((PIXEL_BASE + 9) - 8)
.endm

// CAPTURE_HIGH_BITS_NORMAL reg
// XORs four more 3-bit pixels from the sample in r8 into r10 (bits 31..16)
// and writes the finished word to reg. r8 is not modified.
// Every shift is written as (destination bit - source bit).
// Clobb: r9, r10, r14
.macro CAPTURE_HIGH_BITS_NORMAL reg
        // Pixel 4 in GPIO  4.. 2 -> 23..20
        // Pixel 5 in GPIO  7.. 5 -> 19..16
        // Pixel 6 in GPIO 10.. 8 -> 31..28
        // Pixel 7 in GPIO 13..11 -> 27..24

        and    r9, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 3))
        eor    r10, r10, r9, lsl #(20 - (PIXEL_BASE + 0))
        eor    r10, r10, r14, lsl #(16 - (PIXEL_BASE + 3))

        and    r9, r8, #(0x7 << (PIXEL_BASE + 6))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 9))
        eor    r10, r10, r9, lsl #(28 - (PIXEL_BASE + 6))
        eor    \reg, r10, r14, lsl #(24 - (PIXEL_BASE + 9))
.endm

// CAPTURE_BITS_DOUBLE reg reg2
// XORs four 3-bit pixels from the sample in r8 into the value of reg, one
// per byte in the upper nibble, then copies each upper nibble into the
// nibble below it (pixel doubling) and writes the result to reg2.
// r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_BITS_DOUBLE reg reg2
        // Pixel 0 in GPIO  4.. 2 ->  7.. 4 and  3.. 0
        // Pixel 1 in GPIO  7.. 5 -> 15..12 and 11.. 8
        // Pixel 2 in GPIO 10.. 8 -> 23..20 and 19..16
        // Pixel 3 in GPIO 13..11 -> 31..28 and 27..24

        and    r9, r8, #(7 << PIXEL_BASE)
        and    r14, r8, #(7 << (PIXEL_BASE + 3))
        eor    r10, \reg, r9, lsl #(4 - PIXEL_BASE)
        eor    r10, r10, r14, lsl #(12 - (PIXEL_BASE + 3))

        and    r9, r8, #(7 << (PIXEL_BASE + 6))
        and    r14, r8, #(7 << (PIXEL_BASE + 9))
        eor    r10, r10, r9, lsl #(20 - (PIXEL_BASE + 6))
        eor    r10, r10, r14, lsl #(28 - (PIXEL_BASE + 9))

        // Pixel double
        orr    \reg2, r10, r10, lsr #4
.endm

// CAPTURE_0_BITS_WIDE reg
// First of four steps that build a word of eight 4-bit pixels from 6-bit
// wide samples. Only the low 3 bits of each 6-bit field are kept (mask
// 0x07). The two fields are XORed into the value of reg; result in r10.
// r8 and reg are not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_0_BITS_WIDE reg
        // Pixel 0 in GPIO  7.. 2 ->  7.. 4
        // Pixel 1 in GPIO 13.. 8 ->  3.. 0

        and    r9, r8, #(0x07 << PIXEL_BASE)
        and    r14, r8, #(0x07 << (PIXEL_BASE + 6))
        eor    r10, \reg, r9, lsl #(4 - PIXEL_BASE)
        eor    r10, r10, r14, lsr #(6 + PIXEL_BASE)
.endm

// CAPTURE_1_BITS_WIDE
// Second of four steps that build a word of eight 4-bit pixels from 6-bit
// wide samples. Only the low 3 bits of each 6-bit field are kept (mask
// 0x07). The two fields are XORed into r10 at bits 15..12 and 11..8.
// r8 is not modified.
// Clobb: r9, r10, r14
//
// The second field sits at bit (PIXEL_BASE + 6) and has to land on bit 8,
// which is a LEFT shift of 8 - (PIXEL_BASE + 6). The previous form,
// "lsr #(2 - PIXEL_BASE)", shifted the wrong way and was only correct
// because the amount is 0 when PIXEL_BASE is 2.
.macro CAPTURE_1_BITS_WIDE
        // Pixel 0 in GPIO  7.. 2 ->  15.. 12
        // Pixel 1 in GPIO 13.. 8 ->  11.. 8

        and    r9, r8, #(0x07 << (PIXEL_BASE))
        and    r14, r8, #(0x07 << (PIXEL_BASE + 6))
        eor    r10, r10, r9, lsl #(12 - PIXEL_BASE)
        eor    r10, r10, r14, lsl #(8 - (PIXEL_BASE + 6))
.endm

// CAPTURE_2_BITS_WIDE
// Third of four steps that build a word of eight 4-bit pixels from 6-bit
// wide samples. Only the low 3 bits of each 6-bit field are kept (mask
// 0x07). The two fields are XORed into r10 at bits 23..20 and 19..16.
// Every shift is written as (destination bit - source bit).
// Clobb: r9, r10, r14
.macro CAPTURE_2_BITS_WIDE
        // Pixel 4 in GPIO  7.. 2 -> 23..20
        // Pixel 5 in GPIO 13.. 8 -> 19..16
        and    r9, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 6))
        eor    r10, r10, r9, lsl #(20 - (PIXEL_BASE + 0))
        eor    r10, r10, r14, lsl #(16 - (PIXEL_BASE + 6))
.endm

// CAPTURE_3_BITS_WIDE reg
// Last of four steps that build a word of eight 4-bit pixels from 6-bit
// wide samples. Only the low 3 bits of each 6-bit field are kept (mask
// 0x07). The two fields are XORed into r10 at bits 31..28 and 27..24, and
// the finished word is written to reg.
// Every shift is written as (destination bit - source bit).
// Clobb: r9, r10, r14
.macro CAPTURE_3_BITS_WIDE reg
        // Pixel 6 in GPIO 7.. 2 -> 31..28
        // Pixel 7 in GPIO 13..8 -> 27..24
        and    r9, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 6))
        eor    r10, r10, r9, lsl #(28 - (PIXEL_BASE + 0))
        eor    \reg, r10, r14, lsl #(24 - (PIXEL_BASE + 6))
.endm

// CAPTURE_LOW_BITS_DOUBLE_WIDE reg
// First half of the pixel-doubled 4-bit capture from 6-bit wide samples.
// Only the low 3 bits of each 6-bit field are kept (mask 0x07). The two
// fields are XORed into the value of reg at bits 7..4 and 15..12; result
// in r10. r8 and reg are not modified.
// Clobb: r9, r10, r14
//
// The second field sits at bit (PIXEL_BASE + 6) and has to land on bit 12,
// a left shift of 12 - (PIXEL_BASE + 6). The previous "2 + PIXEL_BASE"
// produced the same number only when PIXEL_BASE is 2.
.macro CAPTURE_LOW_BITS_DOUBLE_WIDE reg
        // Pixel 0 in GPIO  7.. 2 ->  7.. 4
        // Pixel 1 in GPIO 13.. 8 ->  15.. 12

        and    r9, r8, #(0x07 << PIXEL_BASE)
        and    r14, r8, #(0x07 << (PIXEL_BASE + 6))
        eor    r10, \reg, r9, lsl #(4 - PIXEL_BASE)
        eor    r10, r10, r14, lsl #(12 - (PIXEL_BASE + 6))
.endm

// CAPTURE_HIGH_BITS_DOUBLE_WIDE reg
// Second half of the pixel-doubled 4-bit capture from 6-bit wide samples.
// Only the low 3 bits of each 6-bit field are kept (mask 0x07). The two
// fields are XORed into r10 at bits 23..20 and 31..28, then every upper
// nibble is copied into the nibble below it and the word is written to reg.
// Every shift is written as (destination bit - source bit).
// Clobb: r9, r10, r14
.macro CAPTURE_HIGH_BITS_DOUBLE_WIDE reg
        // Pixel 2 in GPIO  7.. 2 -> 23..20
        // Pixel 3 in GPIO 13.. 8 -> 31..28
        and    r9, r8, #(0x7 << (PIXEL_BASE + 0))
        and    r14, r8, #(0x7 << (PIXEL_BASE + 6))
        eor    r10, r10, r9, lsl #(20 - (PIXEL_BASE + 0))
        eor    r10, r10, r14, lsl #(28 - (PIXEL_BASE + 6))

        // Pixel double
        orr    \reg, r10, r10, lsr #4
.endm

// CAPTURE_BITS_8BPP
// Spreads four 3-bit pixels from the sample in r8 into the low 3 bits of
// each byte of r10.
// In:    r8  = GPIO sample
// Out:   r10 = four pixels, one per byte (all other bits zero)
// Clobb: r8, r9
.macro CAPTURE_BITS_8BPP
        // Pixel 0 in GPIO  4.. 2 ->  7.. 0
        // Pixel 1 in GPIO  7.. 5 -> 15.. 8
        // Pixel 2 in GPIO 10.. 8 -> 23..16
        // Pixel 3 in GPIO 13..11 -> 31..24

        and    r10, r8, #(7 << PIXEL_BASE)
        and    r9, r8, #(7 << (PIXEL_BASE + 3))
        mov    r10, r10, lsr #(PIXEL_BASE)
        orr    r10, r10, r9, lsl #(8 - (PIXEL_BASE + 3))

        and    r9, r8, #(7 << (PIXEL_BASE + 6))
        and    r8, r8, #(7 << (PIXEL_BASE + 9))
        orr    r10, r10, r9, lsl #(16 - (PIXEL_BASE + 6))
        orr    r10, r10, r8, lsl #(24 - (PIXEL_BASE + 9))
.endm

// CAPTURE_BITS_8BPP_NORMAL reg reg2
// XORs four 3-bit pixels from the sample in r8 into the value of reg, one
// per byte in the low 3 bits, and writes the result to reg2.
// r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_BITS_8BPP_NORMAL reg reg2
        // Pixel 0 in GPIO  4.. 2 ->  7.. 0
        // Pixel 1 in GPIO  7.. 5 -> 15.. 8
        // Pixel 2 in GPIO 10.. 8 -> 23..16
        // Pixel 3 in GPIO 13..11 -> 31..24

        and    r9, r8, #(7 << PIXEL_BASE)
        and    r14, r8, #(7 << (PIXEL_BASE + 3))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE)
        eor    r10, r10, r14, lsl #(8 - (PIXEL_BASE + 3))

        and    r9, r8, #(7 << (PIXEL_BASE + 6))
        and    r14, r8, #(7 << (PIXEL_BASE + 9))
        eor    r10, r10, r9, lsl #(16 - (PIXEL_BASE + 6))
        eor    \reg2, r10, r14, lsl #(24 - (PIXEL_BASE + 9))
.endm

// CAPTURE_LOW_BITS_DOUBLE_8BPP reg reg2
// Takes the first two 3-bit pixels of the sample in r8, XORs them into the
// value of reg at bytes 0 and 2, then copies each of those bytes into the
// byte above it (pixel doubling) and writes the result to reg2.
// r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_LOW_BITS_DOUBLE_8BPP reg reg2
        // Pixel 0 in GPIO  4.. 2 ->  7.. 0 (doubled into 15.. 8)
        // Pixel 1 in GPIO  7.. 5 -> 23..16 (doubled into 31..24)

        and    r9, r8, #(7 << PIXEL_BASE)
        and    r14, r8, #(7 << (PIXEL_BASE + 3))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE)
        eor    r10, r10, r14, lsl #(16 - (PIXEL_BASE + 3))
        // Pixel double
        orr    \reg2, r10, r10, lsl #8
.endm

// CAPTURE_HIGH_BITS_DOUBLE_8BPP reg reg2
// Takes the last two 3-bit pixels of the sample in r8, XORs them into the
// value of reg at bytes 0 and 2, then copies each of those bytes into the
// byte above it (pixel doubling) and writes the result to reg2.
// r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_HIGH_BITS_DOUBLE_8BPP reg reg2
        // Pixel 2 in GPIO 10.. 8 ->  7.. 0 (doubled into 15.. 8)
        // Pixel 3 in GPIO 13..11 -> 23..16 (doubled into 31..24)

        and    r9, r8, #(7 << (PIXEL_BASE + 6))
        and    r14, r8, #(7 << (PIXEL_BASE + 9))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE + 6)
        eor    r10, r10, r14, lsl #(16 - (PIXEL_BASE + 9))
        // Pixel double
        orr    \reg2, r10, r10, lsl #8
.endm


// CAPTURE_LOW_BITS_8BPP_WIDE reg
// XORs the two 6-bit pixels of the sample in r8 into the value of reg at
// bytes 0 and 1; result in r10. r8 and reg are not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_LOW_BITS_8BPP_WIDE reg
        // Pixel 0 in GPIO  7.. 2 ->  7.. 0
        // Pixel 1 in GPIO 13.. 8 -> 15.. 8

        and    r9, r8, #(0x3f << PIXEL_BASE)
        and    r14, r8, #(0x3f << (PIXEL_BASE + 6))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE)
        eor    r10, r10, r14, lsl #(8 - (PIXEL_BASE + 6))
.endm

// CAPTURE_HIGH_BITS_8BPP_WIDE reg
// XORs the two 6-bit pixels of the sample in r8 into r10 at bytes 2 and 3
// and writes the finished word to reg. r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_HIGH_BITS_8BPP_WIDE reg
        // Pixel 2 in GPIO  7.. 2 -> 23..16
        // Pixel 3 in GPIO 13.. 8 -> 31..24

        and    r9, r8, #(0x3f << PIXEL_BASE)
        and    r14, r8, #(0x3f << (PIXEL_BASE + 6))
        eor    r10, r10, r9, lsl #(16 - PIXEL_BASE)
        eor    \reg, r10, r14, lsl #(24 - (PIXEL_BASE + 6))
.endm

// CAPTURE_LOW_BITS_ODD_EVEN_8BPP_WIDE reg
// Variant of CAPTURE_LOW_BITS_8BPP_WIDE in which both masks select the SAME
// 6-bit field (GPIO 7..2): that one pixel is XORed into the value of reg at
// byte 0 and again at byte 1. The field at GPIO 13..8 is not used.
// Result in r10; r8 and reg are not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_LOW_BITS_ODD_EVEN_8BPP_WIDE reg
        // Pixel in GPIO  7.. 2 ->  7.. 0
        // same pixel          -> 15.. 8

        and    r9, r8, #(0x3f << PIXEL_BASE)
        and    r14, r8, #(0x3f << (PIXEL_BASE))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE)
        eor    r10, r10, r14, lsl #(8 - (PIXEL_BASE))
.endm

// CAPTURE_HIGH_BITS_ODD_EVEN_8BPP_WIDE reg
// Variant of CAPTURE_HIGH_BITS_8BPP_WIDE in which both masks select the
// SAME 6-bit field (GPIO 7..2): that one pixel is XORed into r10 at byte 2
// and again at byte 3, and the finished word is written to reg.
// The field at GPIO 13..8 is not used. r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_HIGH_BITS_ODD_EVEN_8BPP_WIDE reg
        // Pixel in GPIO  7.. 2 -> 23..16
        // same pixel          -> 31..24

        and    r9, r8, #(0x3f << PIXEL_BASE)
        and    r14, r8, #(0x3f << (PIXEL_BASE))
        eor    r10, r10, r9, lsl #(16 - PIXEL_BASE)
        eor    \reg, r10, r14, lsl #(24 - (PIXEL_BASE))
.endm

// CAPTURE_BITS_DOUBLE_8BPP_WIDE reg reg2
// XORs the two 6-bit pixels of the sample in r8 into the value of reg at
// bytes 0 and 2, then copies each of those bytes into the byte above it
// (pixel doubling) and writes the result to reg2. r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_BITS_DOUBLE_8BPP_WIDE reg reg2
        // Pixel 0 in GPIO  7.. 2 ->  7.. 0
        // Pixel 1 in GPIO 13.. 8 -> 23..16

        and    r9, r8, #(0x3f << PIXEL_BASE)
        and    r14, r8, #(0x3f << (PIXEL_BASE + 6))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE)
        eor    r10, r10, r14, lsl #(16 - (PIXEL_BASE + 6))
        // Pixel double
        orr    \reg2, r10, r10, lsl #8
.endm

// CAPTURE_BITS_DOUBLE_ODD_EVEN_8BPP_WIDE reg reg2
// Variant of CAPTURE_BITS_DOUBLE_8BPP_WIDE in which both masks select the
// SAME 6-bit field (GPIO 7..2): that one pixel is XORed into the value of
// reg at bytes 0 and 2, each byte is then copied into the byte above it,
// and the result is written to reg2. The field at GPIO 13..8 is not used.
// r8 is not modified.
// Clobb: r9, r10, r14
.macro CAPTURE_BITS_DOUBLE_ODD_EVEN_8BPP_WIDE reg reg2
        // Pixel in GPIO  7.. 2 ->  7.. 0
        // same pixel          -> 23..16

        and    r9, r8, #(0x3f << PIXEL_BASE)
        and    r14, r8, #(0x3f << (PIXEL_BASE))
        eor    r10, \reg, r9, lsr #(PIXEL_BASE)
        eor    r10, r10, r14, lsl #(16 - (PIXEL_BASE))
        // Pixel double
        orr    \reg2, r10, r10, lsl #8
.endm


// CAPTURE_SIX_BITS_16BPP reg1 reg2
// Translates the two 6-bit pixels of the sample in r8 through a table of
// 32-bit words addressed by r14 and combines them into one output word.
// The byte offset into the table is (pixel << PIXEL_BASE).
// NOTE(review): r14 is presumably a palette lookup table with 16
// significant bits per entry - confirm against the caller.
// In:    r8 = GPIO sample, r14 = table base address
// Out:   reg2 = reg1 eor entry(pixel 0) eor (entry(pixel 1) << 16)
// Clobb: r9, r10
.macro CAPTURE_SIX_BITS_16BPP reg1 reg2
        // Pixel 0 in GPIO  7.. 2 -> table entry, XORed in unshifted
        // Pixel 1 in GPIO 13.. 8 -> table entry, XORed in shifted left by 16
        and    r9, r8, #(0x3f << PIXEL_BASE)
        ldr    r9, [r14, r9]
        eor    r10, r9, \reg1
        and    r9, r8, #(0x3f << (PIXEL_BASE + 6))
        ldr    r9, [r14, r9, lsr #6]              // lsr #6 brings the second field to the same offset scale as the first
        eor    \reg2, r10, r9, lsl #16
.endm

// CAPTURE_SIX_BITS_DOUBLE_16BPP_LO reg1 reg2
// Looks up the first 6-bit pixel of the sample in r8 (GPIO 7..2) in the
// table addressed by r14 and places the entry in both halves of the output.
// In:    r8 = GPIO sample, r14 = table base address
// Out:   reg2 = reg1 eor entry eor (entry << 16)
// Clobb: r9, r10
.macro CAPTURE_SIX_BITS_DOUBLE_16BPP_LO reg1 reg2
        // Pixel 0 in GPIO  7.. 2 -> table entry, XORed in unshifted
        // same entry              -> XORed in shifted left by 16
        and    r9, r8, #(0x3f << PIXEL_BASE)
        ldr    r9, [r14, r9]
        eor    r10, r9, \reg1
        eor    \reg2, r10, r9, lsl #16
.endm

// CAPTURE_SIX_BITS_DOUBLE_16BPP_HI reg1 reg2
// Looks up the second 6-bit pixel of the sample in r8 (GPIO 13..8) in the
// table addressed by r14 and places the entry in both halves of the output.
// In:    r8 = GPIO sample, r14 = table base address
// Out:   reg2 = reg1 eor entry eor (entry << 16)
// Clobb: r9, r10
.macro CAPTURE_SIX_BITS_DOUBLE_16BPP_HI reg1 reg2
        // Pixel 1 in GPIO 13.. 8 -> table entry, XORed in unshifted
        // same entry              -> XORed in shifted left by 16
        and    r9, r8, #(0x3f << (PIXEL_BASE + 6))
        ldr    r9, [r14, r9, lsr #6]              // lsr #6 brings the field to the offset scale used for GPIO 7..2
        eor    r10, r9, \reg1
        eor    \reg2, r10, r9, lsl #16
.endm

// SETUP_EIGHT_BITS_MASK_R14
// Loads r14 with the mask that BIT_SHIFT_EIGHT_BITS ANDs into r9.
// BIT_SHIFT_EIGHT_BITS builds its 8-bit result in bits
// (PIXEL_BASE + 1)..(PIXEL_BASE + 8), so the mask is positioned with the
// same expression instead of a literal 3 (the two agree when PIXEL_BASE
// is 2). With BIT_OSD set in r3 the mask is 7 bits wide, which drops the
// top bit of the field.
// In:    r3 = flags word
// Out:   r14 = 0x7f or 0xff, shifted left by PIXEL_BASE + 1
// Clobb: flags
.macro SETUP_EIGHT_BITS_MASK_R14
        tst    r3, #BIT_OSD
        movne  r14, #0x7f                         // OSD on: leave out the top bit
        moveq  r14, #0xff                         // OSD off: keep all eight bits
        mov    r14, r14, lsl #(PIXEL_BASE + 1)    // line up with the field built by BIT_SHIFT_EIGHT_BITS
.endm

// This extracts 8 bits from 9 or 12 bpp capture to be written to an 8 bit buffer
// the bits are in the order R3,R2,R1,R0,G3,G2,G1,G0,B3,B2,B1,B0 so the correct 8 bits have to be extracted
// which would be B1,R1,B2,G2,R2,B3,G3,R3 however there isn't enough time to rearrange all bits
// so just mask out the unwanted ones and move the wanted ones in their place.
// This means the bit order is different from 3 bpp and 6 bpp but that can be fixed by reordering the bits
// in the palette lookup table


.macro  BIT_SHIFT_EIGHT_BITS
        // Reduces the 12-bit sample in r8 to an 8-bit value in r9 by masking and moving
        // individual bits (see the bit-order note above this macro).
        // In:  r8 = GPIO sample, r14 = mask applied to the result, r3 = flags
        // Out: r9 = 8-bit value occupying the bits from (PIXEL_BASE + 1) upwards;
        //      r3 gets BITDUP_FFOSD_DETECTED set when BITDUP_ENABLE_FFOSD is set and
        //      any MUX_MASK bit is set in r8; flags are overwritten
        and    r9, r8, #(0xCC << PIXEL_BASE)  // extract 0,G3,G2,0,0,B3,B2,0,x,x,x (shifted left by PIXEL_BASE + 1 to put red lsb rather than green msb in top bit)
        tst    r8, #(0x02 << PIXEL_BASE)      // move B1
        orrne  r9, r9, #(0x100 << PIXEL_BASE)
        tst    r8, #(0x800 << PIXEL_BASE)     // move R3
        orrne  r9, r9, #(0x20 << PIXEL_BASE)
        tst    r8, #(0x400 << PIXEL_BASE)     // move R2
        orrne  r9, r9, #(0x10 << PIXEL_BASE)
        tst    r8, #(0x200 << PIXEL_BASE)     // move R1
        orrne  r9, r9, #(0x02 << PIXEL_BASE)  // order is now B1,G3,G2,R3,R2,B3,B2,R1,x,x,x
        and    r9, r9, r14                    // mask out top bit if OSD is on (B1)
        tst    r3, #BITDUP_ENABLE_FFOSD       // code for FFOSD in 8bpp mode
        tstne  r8, #MUX_MASK                  // only examined when FFOSD is enabled
        orrne  r3, r3, #BITDUP_FFOSD_DETECTED // remember that a MUX_MASK bit was seen
        orrne  r9, r9, #(0x100 << PIXEL_BASE) // and force the top bit of this pixel on
.endm

.macro CAPTURE_EIGHT_BITS_8BPP_0 reg
        // First of four pixels: starts the output word in r10 from \reg.
        // In:  r8 = GPIO sample, r14 = 8-bit mask, r3 = flags, \reg = word XORed in
        // Out: r10 = \reg ^ pixel in bits 7..0; clobbers r9 and flags, may update r3
        // Pixel 0 in GPIO ->  7.. 0

        BIT_SHIFT_EIGHT_BITS
        eor    r10, \reg, r9, lsr #(PIXEL_BASE + 1)    // move the pixel down to bit 0
.endm

.macro CAPTURE_EIGHT_BITS_8BPP_1
        // Second of four pixels: XORs it into byte 1 of the word being built in r10.
        // In:  r8 = GPIO sample, r14 = 8-bit mask, r3 = flags, r10 = partial word
        // Out: r10 updated; clobbers r9 and flags, may update r3
        // Pixel 1 in GPIO ->  15.. 8

        BIT_SHIFT_EIGHT_BITS
        eor    r10, r10, r9, lsl #(7 - PIXEL_BASE)     // bit (PIXEL_BASE + 1) lands on bit 8
.endm

.macro CAPTURE_EIGHT_BITS_8BPP_2
        // Third of four pixels: XORs it into byte 2 of the word being built in r10.
        // In:  r8 = GPIO sample, r14 = 8-bit mask, r3 = flags, r10 = partial word
        // Out: r10 updated; clobbers r9 and flags, may update r3
        // Pixel 2 in GPIO ->  23.. 16

        BIT_SHIFT_EIGHT_BITS
        eor    r10, r10, r9, lsl #(15 - PIXEL_BASE)    // bit (PIXEL_BASE + 1) lands on bit 16
.endm

.macro CAPTURE_EIGHT_BITS_8BPP_3 reg
        // Fourth of four pixels: XORs it into byte 3 and delivers the finished word.
        // In:  r8 = GPIO sample, r14 = 8-bit mask, r3 = flags, r10 = partial word
        // Out: \reg = completed word; clobbers r9 and flags, may update r3
        // Pixel 3 in GPIO ->  31.. 24

        BIT_SHIFT_EIGHT_BITS
        eor    \reg, r10, r9, lsl #(23 - PIXEL_BASE)   // bit (PIXEL_BASE + 1) lands on bit 24
.endm

.macro CAPTURE_EIGHT_BITS_DOUBLE_8BPP_LO reg
        // First pixel of a pixel-doubled pair: starts the output word in r10 from \reg.
        // In:  r8 = GPIO sample, r14 = 8-bit mask, r3 = flags, \reg = word XORed in
        // Out: r10 = \reg ^ pixel in bits 7..0; clobbers r9 and flags, may update r3
        // Pixel 0 in GPIO ->  7.. 0

        BIT_SHIFT_EIGHT_BITS
        eor    r10, \reg, r9, lsr #(PIXEL_BASE + 1)    // move the pixel down to bit 0
.endm

.macro CAPTURE_EIGHT_BITS_DOUBLE_8BPP_HI reg
        // Second pixel of a pixel-doubled pair: adds it to byte 2 of r10, then ORs in a
        // copy of the word shifted up one byte so each pixel appears twice.
        // In:  r8 = GPIO sample, r14 = 8-bit mask, r3 = flags, r10 = partial word
        // Out: \reg = doubled word; r10 holds the undoubled word; clobbers r9 and flags
        // Pixel 1 in GPIO ->  23.. 16
        BIT_SHIFT_EIGHT_BITS
        eor    r10, r10, r9, lsl #(15 - PIXEL_BASE)    // bit (PIXEL_BASE + 1) lands on bit 16
        // Pixel double
        orr    \reg, r10, r10, lsl #8                  // OR in a copy one byte higher
.endm


.macro SETUP_NINELO_BITS_MASK_R14
        // r14 = 0x777 << PIXEL_BASE: the low three bits of each 4-bit field of the
        // 12-bit sample. Assembled from two pieces that each fit an ARM immediate.
        // Out: r14 = mask; nothing else is modified
        mov     r14, #(0x700 << PIXEL_BASE)             // top field
        orr     r14, r14, #(0x077 << PIXEL_BASE)        // middle and bottom fields
.endm

.macro CAPTURE_NINELO_BITS_16BPP_LO reg
        // First pixel of a 16bpp pair, 9-bit "low" variant: keeps the bits selected by
        // r14 in each 4-bit field, moves them up one place and copies the top selected
        // bit of each field into the vacated bottom bit.
        // In:  r8 = GPIO sample, r14 = mask (presumably from SETUP_NINELO_BITS_MASK_R14),
        //      \reg = word XORed into the result
        // Out: r10 = \reg ^ pixel in bits 15..0; r9 = masked sample; r8 is DESTROYED
        // Pixel in GPIO 13.. 2 -> 15.. 0
        and    r9, r8, r14                             // r9 = the selected bits of the sample
        bic    r8, r8, r14, lsr #1                     // drop everything covered by mask >> 1 ...
        eor    r10, \reg, r9, lsr #(PIXEL_BASE - 1)    // selected bits end up one place above bit 0
        and    r8, r8, r14                             // ... leaving only the top selected bit of each field
        eor    r10, r10, r8, lsr #(PIXEL_BASE + 2)     // and copy it into the field's bottom bit
.endm

.macro CAPTURE_NINELO_BITS_16BPP_HI reg
        // Second pixel of a 16bpp pair, 9-bit "low" variant: same bit handling as the
        // _LO macro but placed in the upper half of the word already started in r10.
        // In:  r8 = GPIO sample, r14 = mask (presumably from SETUP_NINELO_BITS_MASK_R14),
        //      r10 = partial word
        // Out: \reg = completed word; r9 = masked sample; r8 is DESTROYED
        // Pixel in GPIO 13.. 2 -> 31.. 16
        and    r9, r8, r14                                  // r9 = the selected bits of the sample
        bic    r8, r8, r14, lsr #1                          // drop everything covered by mask >> 1 ...
        eor    r10, r10, r9, lsl #(16 - (PIXEL_BASE - 1))   // selected bits, one place above bit 16
        and    r8, r8, r14                                  // ... leaving only the top selected bit of each field
        eor    \reg, r10, r8, lsl #(16 - (PIXEL_BASE + 2))  // and copy it into the field's bottom bit
.endm

.macro CAPTURE_NINELO_BITS_DOUBLE_16BPP reg reg2
        // Pixel-doubled 9-bit "low" capture: the same converted pixel is XORed into both
        // halves of the output word.
        // In:  r8 = GPIO sample, r14 = mask (presumably from SETUP_NINELO_BITS_MASK_R14),
        //      \reg = word XORed into the result
        // Out: \reg2 = doubled word; r9 and r10 are overwritten; r8 is DESTROYED
        // Pixel in GPIO 13.. 2 -> 15.. 0 and 31..16
        and    r9, r8, r14                             // r9 = the selected bits of the sample
        bic    r8, r8, r14, lsr #1                     // drop everything covered by mask >> 1 ...
        eor    r10, \reg, r9, lsr #(PIXEL_BASE - 1)    // low half: selected bits one place above bit 0
        and    r8, r8, r14                             // ... leaving only the top selected bit of each field
        eor    r10, r10, r8, lsr #(PIXEL_BASE + 2)     // low half: fill the bottom bit of each field

        eor    r10, r10, r9, lsl #(16 - (PIXEL_BASE - 1))     // high half: same selected bits
        eor    \reg2, r10, r8, lsl #(16 - (PIXEL_BASE + 2))   // high half: same fill bits
.endm

.macro SETUP_NINEHI_BITS_MASK_R14
        // r14 = 0xeee << PIXEL_BASE: the high three bits of each 4-bit field of the
        // 12-bit sample. Assembled from two pieces that each fit an ARM immediate.
        // Out: r14 = mask; nothing else is modified
        mov     r14, #(0xe00 << PIXEL_BASE)             // top field
        orr     r14, r14, #(0x0ee << PIXEL_BASE)        // middle and bottom fields
.endm

.macro CAPTURE_NINEHI_BITS_16BPP_LO reg
        // First pixel of a 16bpp pair, 9-bit "high" variant: keeps the bits selected by
        // r14 in each 4-bit field in place and copies the top selected bit of each
        // field into that field's bottom bit.
        // In:  r8 = GPIO sample, r14 = mask (presumably from SETUP_NINEHI_BITS_MASK_R14),
        //      \reg = word XORed into the result
        // Out: r10 = \reg ^ pixel in bits 15..0; r9 = masked sample; r8 is DESTROYED
        // Pixel in GPIO 13.. 2 -> 15.. 0
        and    r9, r8, r14                             // r9 = the selected bits of the sample
        bic    r8, r8, r14, lsr #1                     // drop everything covered by mask >> 1 ...
        eor    r10, \reg, r9, lsr #PIXEL_BASE          // selected bits moved down to start at bit 0
        and    r8, r8, r14                             // ... leaving only the top selected bit of each field
        eor    r10, r10, r8, lsr #(PIXEL_BASE + 3)     // and copy it into the field's bottom bit
.endm

.macro CAPTURE_NINEHI_BITS_16BPP_HI reg
        // Second pixel of a 16bpp pair, 9-bit "high" variant: same bit handling as the
        // _LO macro but placed in the upper half of the word already started in r10.
        // In:  r8 = GPIO sample, r14 = mask (presumably from SETUP_NINEHI_BITS_MASK_R14),
        //      r10 = partial word
        // Out: \reg = completed word; r9 = masked sample; r8 is DESTROYED
        // Pixel in GPIO 13.. 2 -> 31.. 16
        and    r9, r8, r14                                  // r9 = the selected bits of the sample
        bic    r8, r8, r14, lsr #1                          // drop everything covered by mask >> 1 ...
        eor    r10, r10, r9, lsl #(16 - PIXEL_BASE)         // selected bits moved to start at bit 16
        and    r8, r8, r14                                  // ... leaving only the top selected bit of each field
        eor    \reg, r10, r8, lsl #(16 - (PIXEL_BASE + 3))  // and copy it into the field's bottom bit
.endm

.macro CAPTURE_NINEHI_BITS_DOUBLE_16BPP reg reg2
        // Pixel-doubled 9-bit "high" capture: the same converted pixel is XORed into both
        // halves of the output word.
        // In:  r8 = GPIO sample, r14 = mask (presumably from SETUP_NINEHI_BITS_MASK_R14),
        //      \reg = word XORed into the result
        // Out: \reg2 = doubled word; r9 and r10 are overwritten; r8 is DESTROYED
        // Pixel in GPIO 13.. 2 -> 15.. 0 and 31..16
        and    r9, r8, r14                             // r9 = the selected bits of the sample
        bic    r8, r8, r14, lsr #1                     // drop everything covered by mask >> 1 ...
        eor    r10, \reg, r9, lsr #PIXEL_BASE          // low half: selected bits from bit 0
        and    r8, r8, r14                             // ... leaving only the top selected bit of each field
        eor    r10, r10, r8, lsr #(PIXEL_BASE + 3)     // low half: fill the bottom bit of each field

        eor    r10, r10, r9, lsl #(16 - PIXEL_BASE)           // high half: same selected bits
        eor    \reg2, r10, r8, lsl #(16 - (PIXEL_BASE + 3))   // high half: same fill bits
.endm

.macro SETUP_TWELVE_BITS_MASK_R14
        // r14 = 0xfff << PIXEL_BASE: all twelve pixel bits of the sample.
        // Assembled from two pieces that each fit an ARM immediate.
        // Out: r14 = mask; nothing else is modified
        mov     r14, #(0xf00 << PIXEL_BASE)             // top four bits
        orr     r14, r14, #(0x0ff << PIXEL_BASE)        // lower eight bits
.endm

.macro CAPTURE_TWELVE_BITS_16BPP_LO reg
        // First pixel of a 16bpp pair: masked sample goes into the low half of r10.
        // In:  r8 = GPIO sample, r14 = pixel mask, \reg = word XORed into the result
        // Out: r10 = \reg ^ pixel (bits 15..0); r9 = masked sample
        // Pixel in GPIO 13.. 2 -> 15.. 0
        and    r9, r14, r8                         // isolate the pixel bits
        eor    r10, \reg, r9, lsr #PIXEL_BASE      // shift down to bit 0 and merge
.endm

.macro CAPTURE_TWELVE_BITS_16BPP_HI reg
        // Second pixel of a 16bpp pair: masked sample goes into the high half.
        // In:  r8 = GPIO sample, r14 = pixel mask, r10 = word holding the first pixel
        // Out: \reg = r10 ^ (pixel << 16); r9 = masked sample
        // Pixel in GPIO 13.. 2 -> 31.. 16
        and    r9, r14, r8                              // isolate the pixel bits
        eor    \reg, r10, r9, lsl #(16 - PIXEL_BASE)    // shift up to bit 16 and merge
.endm

.macro TEST_CAPTURE_TWELVE_BITS_16BPP_LO reg
        // As CAPTURE_TWELVE_BITS_16BPP_LO, and additionally records in r3 whether any
        // MUX_MASK bit was set in the sample.
        // In:  r8 = GPIO sample, r14 = pixel mask, \reg = word XORed into the result
        // Out: r10 = \reg ^ pixel (bits 15..0); r9 = masked sample;
        //      r3 |= BITDUP_FFOSD_DETECTED when (r8 & MUX_MASK) != 0; flags overwritten
        // Pixel in GPIO 13.. 2 -> 15.. 0
        and    r9, r14, r8                         // isolate the pixel bits
        eor    r10, \reg, r9, lsr #PIXEL_BASE      // shift down to bit 0 and merge
        tst    r8, #MUX_MASK
        orrne  r3, r3, #BITDUP_FFOSD_DETECTED      // sticky: never cleared here
.endm

.macro TEST_CAPTURE_TWELVE_BITS_16BPP_HI reg
        // As CAPTURE_TWELVE_BITS_16BPP_HI, and additionally records in r3 whether any
        // MUX_MASK bit was set in the sample.
        // In:  r8 = GPIO sample, r14 = pixel mask, r10 = word holding the first pixel
        // Out: \reg = r10 ^ (pixel << 16); r9 = masked sample;
        //      r3 |= BITDUP_FFOSD_DETECTED when (r8 & MUX_MASK) != 0; flags overwritten
        // Pixel in GPIO 13.. 2 -> 31.. 16
        and    r9, r14, r8                              // isolate the pixel bits
        eor    \reg, r10, r9, lsl #(16 - PIXEL_BASE)    // shift up to bit 16 and merge
        tst    r8, #MUX_MASK
        orrne  r3, r3, #BITDUP_FFOSD_DETECTED           // sticky: never cleared here
.endm

.macro OSD_CAPTURE_TWELVE_BITS_16BPP_LO reg
        // First pixel of a 16bpp pair with two extra checks on the sample:
        //  - if the pixel differs from GREY_PIXELS, BITDUP_LINE_CONDITION_DETECTED is
        //    cleared in r3;
        //  - if any MUX_MASK bit is set, BITDUP_FFOSD_DETECTED is set in r3 and the low
        //    half of r10 is forced to 0xffff.
        // In:  r8 = GPIO sample, r14 = pixel mask, \reg = word XORed into the result
        // Out: r10 = low-half result; r9 = pixel ^ GREY_PIXELS (still shifted);
        //      flags overwritten
        // Pixel in GPIO 13.. 2 -> 15.. 0
        and    r9, r14, r8                                       // isolate the pixel bits
        eor    r10, \reg, r9, lsr #PIXEL_BASE                    // shift down to bit 0 and merge
        eor    r9, r9, #((GREY_PIXELS & 0x0ff) << PIXEL_BASE)    // compare against GREY_PIXELS in two
        eors   r9, r9, #((GREY_PIXELS & 0xf00) << PIXEL_BASE)    // immediate-sized steps; Z = match
        bicne  r3, r3, #BITDUP_LINE_CONDITION_DETECTED           // any non-grey pixel clears the flag
        tst    r8, #MUX_MASK
        orrne  r3, r3, #BITDUP_FFOSD_DETECTED
        orrne  r10, r10, #0x00ff                                 // force the low half to 0xffff
        orrne  r10, r10, #0xff00
.endm

.macro OSD_CAPTURE_TWELVE_BITS_16BPP_HI reg
        // Second pixel of a 16bpp pair with two extra checks on the sample:
        //  - if the pixel differs from GREY_PIXELS, BITDUP_LINE_CONDITION_DETECTED is
        //    cleared in r3;
        //  - if any MUX_MASK bit is set, BITDUP_FFOSD_DETECTED is set in r3 and the high
        //    half of \reg is forced to 0xffff.
        // In:  r8 = GPIO sample, r14 = pixel mask, r10 = word holding the first pixel
        // Out: \reg = completed word; r9 = pixel ^ GREY_PIXELS (still shifted);
        //      flags overwritten
        // Pixel in GPIO 13.. 2 -> 31.. 16
        and    r9, r14, r8                                       // isolate the pixel bits
        eor    \reg, r10, r9, lsl #(16 - PIXEL_BASE)             // shift up to bit 16 and merge
        eor    r9, r9, #((GREY_PIXELS & 0x0ff) << PIXEL_BASE)    // compare against GREY_PIXELS in two
        eors   r9, r9, #((GREY_PIXELS & 0xf00) << PIXEL_BASE)    // immediate-sized steps; Z = match
        bicne  r3, r3, #BITDUP_LINE_CONDITION_DETECTED           // any non-grey pixel clears the flag
        tst    r8, #MUX_MASK
        orrne  r3, r3, #BITDUP_FFOSD_DETECTED
        orrne  \reg, \reg, #0x00ff0000                           // force the high half to 0xffff
        orrne  \reg, \reg, #0xff000000
.endm

.macro CAPTURE_TWELVE_BITS_DOUBLE_16BPP reg reg2
        // Pixel-doubled 12-bit capture: the masked sample is XORed into both halves of
        // the output word.
        // In:  r8 = GPIO sample, r14 = pixel mask, \reg = word XORed into the result
        // Out: \reg2 = doubled word; r9 = masked sample; r10 = low-half intermediate
        // Pixel in GPIO 13.. 2 -> 15.. 0 and 31..16
        and    r9, r14, r8                               // isolate the pixel bits
        eor    r10, \reg, r9, lsr #PIXEL_BASE            // low half
        eor    \reg2, r10, r9, lsl #(16 - PIXEL_BASE)    // same pixel again in the high half
.endm

.macro CAPTURE_LOW_BITS_TRANSLATE
        // Packs the four 3-bit pixels of the sample in r8 into nibbles of the low half
        // of r10, and records which of them are non-zero in r6, r7 and r14.
        // In:  r8 = GPIO sample, r6 / r7 = sentinel accumulators
        // Out: r10 = pixels 0..3 in bits 15..0;
        //      r6  = (r6 << 8) with bits 0x80/0x40/0x20/0x10 set for non-zero pixels 0/1/2/3;
        //      r7  = (r7 << 2) with bit 1 set when pixel 3 is non-zero;
        //      r14 = 0x08 / 0x080000 / 0x04 / 0x040000 set for non-zero pixels 0/1/2/3;
        //      r8 and r9 are overwritten; flags are overwritten
        // (the GPIO bit numbers below assume PIXEL_BASE = 2)
        // Pixel 0 in GPIO  4.. 2 ->  7.. 4
        // Pixel 1 in GPIO  7.. 5 ->  3.. 0
        // Pixel 2 in GPIO 10.. 8 -> 15..12
        // Pixel 3 in GPIO 13..11 -> 11.. 8

        and    r10, r8, #(7 << PIXEL_BASE)              // pixel 0
        and    r9, r8, #(7 << (PIXEL_BASE + 3))         // pixel 1
        mov    r10, r10, lsl #(4 - PIXEL_BASE)          // pixel 0 -> bits 6..4
        orr    r10, r10, r9, lsr #(3 + PIXEL_BASE)      // pixel 1 -> bits 2..0

        and    r9, r8, #(7 << (PIXEL_BASE + 6))         // pixel 2
        and    r8, r8, #(7 << (PIXEL_BASE + 9))         // pixel 3 (r8 no longer holds the sample)
        orr    r10, r10, r9, lsl #(6 - PIXEL_BASE)      // pixel 2 -> bits 14..12
        orr    r10, r10, r8, lsr #(1 + PIXEL_BASE)      // pixel 3 -> bits 10..8

        mov    r6, r6, lsl #8   // mode 0 sentinel
        mov    r7, r7, lsl #2   // mode 0-6 sentinel
        mov    r14, #0          // mode 2 translation

        tst    r10, #0x00000070                         // pixel 0 non-zero?
        orrne  r14, r14, #0x08
        orrne  r6, r6, #0x80
        tst    r10, #0x00000007                         // pixel 1 non-zero?
        orrne  r14, r14, #0x080000
        orrne  r6, r6, #0x40
        tst    r10, #0x00007000                         // pixel 2 non-zero?
        orrne  r14, r14, #0x04
        orrne  r6, r6, #0x20
        tst    r10, #0x00000700                         // pixel 3 non-zero?
        orrne  r14, r14, #0x040000
        orrne  r6, r6, #0x10
        orrne  r7, r7, #2                               // r7 only tracks pixel 3 of this sample
.endm

.macro CAPTURE_HIGH_BITS_TRANSLATE
        // Adds the four 3-bit pixels of a second sample to the upper half of r10 and
        // finishes the r6 / r7 / r14 bookkeeping begun by CAPTURE_LOW_BITS_TRANSLATE.
        // When BITDUP_MODE2_16COLOUR is set in r3, r10 is replaced by a word built from
        // the non-zero flags collected in r14.
        // In:  r8 = GPIO sample, r10 / r6 / r7 / r14 as left by CAPTURE_LOW_BITS_TRANSLATE
        // Out: r10 = eight packed pixels, or the r14-derived word (see end of macro);
        //      r6  |= 0x08/0x04/0x02/0x01 for non-zero pixels 4/5/6/7;
        //      r7  |= 1 when pixel 7 is non-zero;
        //      r14 |= 0x02 / 0x020000 / 0x01 / 0x010000 for non-zero pixels 4/5/6/7;
        //      r8 and r9 are overwritten; flags are overwritten
        // (the GPIO bit numbers below assume PIXEL_BASE = 2)
        // Pixel 4 in GPIO  4.. 2 -> 23..20
        // Pixel 5 in GPIO  7.. 5 -> 19..16
        // Pixel 6 in GPIO 10.. 8 -> 31..28
        // Pixel 7 in GPIO 13..11 -> 27..24

        and    r9, r8, #(7 << PIXEL_BASE)           // this block unoptimised to free up r14
        orr    r10, r10, r9, lsl #(20 - PIXEL_BASE)     // pixel 4 -> bits 22..20
        and    r9, r8, #(7 << (PIXEL_BASE + 3))
        orr    r10, r10, r9, lsl #(13 - PIXEL_BASE)     // pixel 5 -> bits 18..16

        and    r9, r8, #(7 << (PIXEL_BASE + 6))
        and    r8, r8, #(7 << (PIXEL_BASE + 9))         // r8 no longer holds the sample
        orr    r10, r10, r9, lsl #(22 - PIXEL_BASE)     // pixel 6 -> bits 30..28
        orr    r10, r10, r8, lsl #(15 - PIXEL_BASE)     // pixel 7 -> bits 26..24

        tst    r10, #0x00700000                         // pixel 4 non-zero?
        orrne  r14, r14, #0x02
        orrne  r6, r6, #0x08
        tst    r10, #0x00070000                         // pixel 5 non-zero?
        orrne  r14, r14, #0x020000
        orrne  r6, r6, #0x04
        tst    r10, #0x70000000                         // pixel 6 non-zero?
        orrne  r14, r14, #0x01
        orrne  r6, r6, #0x02
        tst    r10, #0x07000000                         // pixel 7 non-zero?
        orrne  r14, r14, #0x010000
        orrne  r6, r6, #0x01
        orrne  r7, r7, #1                               // r7 only tracks pixel 7 of this sample

        tst    r3, #BITDUP_MODE2_16COLOUR
        orrne  r10, r14, r14, lsl #4                    // replicate each r14 nibble into its byte ...
        orrne  r10, r10, r10, lsl #8                    // ... and each byte into its half-word

.endm

.macro CAPTURE_LOW_BITS_TRANSLATE_8BPP
        // Unpacks the four 3-bit pixels of the sample in r8 into one byte each in r5,
        // and records which of them are non-zero in r6, r7 and r14.
        // In:  r8 = GPIO sample, r6 / r7 = sentinel accumulators
        // Out: r5  = pixels 0..3, one per byte;
        //      r6  = (r6 << 8) with bits 0x80/0x40/0x20/0x10 set for non-zero pixels 0/1/2/3;
        //      r7  = (r7 << 2) with bit 1 set when pixel 3 is non-zero;
        //      r14 = 0x08 / 0x08000000 / 0x04 / 0x04000000 set for non-zero pixels 0/1/2/3;
        //      r8 and r9 are overwritten; flags are overwritten
        // (the GPIO bit numbers below assume PIXEL_BASE = 2)
        // Pixel 0 in GPIO  4.. 2 ->  7.. 0
        // Pixel 1 in GPIO  7.. 5 -> 15.. 8
        // Pixel 2 in GPIO 10.. 8 -> 23..16
        // Pixel 3 in GPIO 13..11 -> 31..24

        and    r5, r8, #(7 << PIXEL_BASE)                   // pixel 0
        and    r9, r8, #(7 << (PIXEL_BASE + 3))             // pixel 1
        mov    r5, r5, lsr #(PIXEL_BASE)                    // pixel 0 -> bits 2..0
        orr    r5, r5, r9, lsl #(8 - (PIXEL_BASE + 3))      // pixel 1 -> bits 10..8

        and    r9, r8, #(7 << (PIXEL_BASE + 6))             // pixel 2
        and    r8, r8, #(7 << (PIXEL_BASE + 9))             // pixel 3 (r8 no longer holds the sample)
        orr    r5, r5, r9, lsl #(16 - (PIXEL_BASE + 6))     // pixel 2 -> bits 18..16
        orr    r5, r5, r8, lsl #(24 - (PIXEL_BASE + 9))     // pixel 3 -> bits 26..24

        mov    r6, r6, lsl #8   // mode 0 sentinel
        mov    r7, r7, lsl #2   // mode 0-6 sentinel
        mov    r14, #0          // mode 2 translation (low byte = left pixel, high byte = right pixel)
        tst    r5, #0x00000007                              // pixel 0 non-zero?
        orrne  r14, r14, #0x08
        orrne  r6, r6, #0x80
        tst    r5, #0x00000700                              // pixel 1 non-zero?
        orrne  r14, r14, #0x08000000
        orrne  r6, r6, #0x40
        tst    r5, #0x00070000                              // pixel 2 non-zero?
        orrne  r14, r14, #0x04
        orrne  r6, r6, #0x20
        tst    r5, #0x07000000                              // pixel 3 non-zero?
        orrne  r14, r14, #0x04000000
        orrne  r6, r6, #0x10
        orrne  r7, r7, #2                                   // r7 only tracks pixel 3 of this sample
.endm

.macro CAPTURE_HIGH_BITS_TRANSLATE_8BPP
        // Unpacks the four 3-bit pixels of a second sample into one byte each in r10 and
        // finishes the r6 / r7 / r14 bookkeeping begun by CAPTURE_LOW_BITS_TRANSLATE_8BPP.
        // Delivers the final pair of output words in r9 (first four pixels) and r10
        // (second four pixels).
        // In:  r8 = GPIO sample, r5 / r6 / r7 / r14 as left by CAPTURE_LOW_BITS_TRANSLATE_8BPP,
        //      r3 = flags
        // Out: BITDUP_MODE2_16COLOUR clear: r9 = r5, r10 = the four pixels of this sample;
        //      BITDUP_MODE2_16COLOUR set:   r9 = low byte of r14 in all four bytes,
        //                                   r10 = high byte of r14 in all four bytes;
        //      r6  |= 0x08/0x04/0x02/0x01 for non-zero pixels 4/5/6/7;
        //      r7  |= 1 when pixel 7 is non-zero;
        //      r14 |= 0x02 / 0x02000000 / 0x01 / 0x01000000 for non-zero pixels 4/5/6/7;
        //      r8 is overwritten; flags are overwritten
        // (pixels are numbered 4..7 here as the second sample of the group; the GPIO bit
        //  numbers assume PIXEL_BASE = 2)
        // Pixel 4 in GPIO  4.. 2 ->  7.. 0
        // Pixel 5 in GPIO  7.. 5 -> 15.. 8
        // Pixel 6 in GPIO 10.. 8 -> 23..16
        // Pixel 7 in GPIO 13..11 -> 31..24

        and    r10, r8, #(7 << PIXEL_BASE)                  // pixel 4
        and    r9, r8, #(7 << (PIXEL_BASE + 3))             // pixel 5
        mov    r10, r10, lsr #(PIXEL_BASE)                  // pixel 4 -> bits 2..0
        orr    r10, r10, r9, lsl #(8 - (PIXEL_BASE + 3))    // pixel 5 -> bits 10..8

        and    r9, r8, #(7 << (PIXEL_BASE + 6))             // pixel 6
        and    r8, r8, #(7 << (PIXEL_BASE + 9))             // pixel 7 (r8 no longer holds the sample)
        orr    r10, r10, r9, lsl #(16 - (PIXEL_BASE + 6))   // pixel 6 -> bits 18..16
        orr    r10, r10, r8, lsl #(24 - (PIXEL_BASE + 9))   // pixel 7 -> bits 26..24

        tst    r10, #0x00000007                             // pixel 4 non-zero?
        orrne  r14, r14, #0x02
        orrne  r6, r6, #0x08
        tst    r10, #0x00000700                             // pixel 5 non-zero?
        orrne  r14, r14, #0x02000000
        orrne  r6, r6, #0x04
        tst    r10, #0x00070000                             // pixel 6 non-zero?
        orrne  r14, r14, #0x01
        orrne  r6, r6, #0x02
        tst    r10, #0x07000000                             // pixel 7 non-zero?
        orrne  r14, r14, #0x01000000
        orrne  r6, r6, #0x01
        orrne  r7, r7, #1                                   // r7 only tracks pixel 7 of this sample

        tst    r3, #BITDUP_MODE2_16COLOUR
        moveq  r9, r5                                       // normal case: first word is the low-sample pixels
        andne  r9, r14, #0xff                               // mode 2: take the low byte of r14 ...
        orrne  r9, r9, r9, lsl #8                           // ... and replicate it
        orrne  r9, r9, r9, lsl #16                          //     into all four bytes of r9
        andne  r10, r14, #0xff000000                        // mode 2: take the high byte of r14 ...
        orrne  r10, r10, r10, lsr #8                        // ... and replicate it
        orrne  r10, r10, r10, lsr #16                       //     into all four bytes of r10
.endm

.macro  SETUP_VSYNC_DEBUG_R11
        // Pre-computes in r11 the word that is later XORed into pixel data:
        // 0x11111111 when BIT_VSYNC_MARKER is set (otherwise 0), with the two debug
        // markers toggled in when BIT_DEBUG is set.
        // In:  r3 = flags
        // Out: r11 = XOR pattern; flags overwritten
        tst     r3, #BIT_VSYNC_MARKER
        moveq   r11, #0                 // no vsync marker
        ldrne   r11, =0x11111111        // vsync marker in every nibble
        tst     r3, #BIT_DEBUG
        eorne   r11, r11, #0x02000000   //green in rightmost
        eorne   r11, r11, #0x50         //magenta in leftmost
.endm

.macro  SETUP_VSYNC_DEBUG_R11_DOUBLE
        // Pixel-doubled counterpart of SETUP_VSYNC_DEBUG_R11: r11 becomes 0x10101010
        // when BIT_VSYNC_MARKER is set (otherwise 0), with the two debug markers
        // toggled in when BIT_DEBUG is set.
        // In:  r3 = flags
        // Out: r11 = XOR pattern; flags overwritten
        tst     r3, #BIT_VSYNC_MARKER
        moveq   r11, #0                 // no vsync marker
        ldrne   r11, =0x10101010        // vsync marker in the upper nibble of every byte
        tst     r3, #BIT_DEBUG
        eorne   r11, r11, #0x20000000   //green in rightmost << 4
        eorne   r11, r11, #0x50         //magenta in leftmost
.endm

.macro  WRITE_R7_IF_LAST
        // Does nothing unless r1 == 1. In that case stores r7 at [r0]; then, when
        // BIT_NO_LINE_DOUBLE is clear, also stores r7 at [r0 - r2], ORed with
        // 0x88888888 first if BIT_NO_SCANLINES and BIT_OSD are clear too.
        // In:  r0 = store address, r1 = counter, r2 = offset subtracted for the second
        //      store (presumably the line pitch), r3 = flags, r7 = data
        // Out: when the second store happens r0 is left at r0 - r2; r7 and r8 may be
        //      modified; flags overwritten
        cmp     r1, #1
        stmeqia r0, {r7}                                // first copy
        tsteq   r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE)
        ldreq   r8, =0x88888888                         // only when r1 == 1 and none of the flags set
        orreq   r7, r7, r8
        cmp     r1, #1                                  // re-establish the r1 == 1 condition
        tsteq   r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // note: r0 is not moved back afterwards
        stmeqia r0, {r7}                                // second copy
.endm

.macro  WRITE_R7_R10
        // Stores r7 and r10 at [r0]; when BIT_NO_LINE_DOUBLE is clear a second copy is
        // stored at [r0 - r2], ORed with 0x88888888 first if BIT_NO_SCANLINES and
        // BIT_OSD are clear too. Finally advances r0 by 8.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r7 / r10 = data
        // Out: r0 += 8; r7, r10 and r8 may be modified; flags overwritten
        stmia   r0, {r7, r10}                           // first copy
        tst     r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE)
        ldreq   r8, =0x88888888
        orreq   r10, r10, r8
        orreq   r7, r7, r8
        tst     r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // step to the other line ...
        stmeqia r0, {r7, r10}                           // ... second copy ...
        addeq   r0, r0, r2                              // ... and step back
        add     r0, r0, #8
.endm

.macro  SETUP_VSYNC_DEBUG_R11_R12
        // Pre-computes the pair of words later XORed into two words of pixel data:
        // both are 0x40404040 when BIT_VSYNC_MARKER is set (otherwise 0); with
        // BIT_DEBUG set, r11 gets the left marker and r12 the right marker.
        // In:  r3 = flags
        // Out: r11, r12 = XOR patterns; flags overwritten
        tst     r3, #BIT_VSYNC_MARKER
        moveq   r11, #0                   // no vsync marker
        moveq   r12, #0
        ldrne   r11, =0x40404040          // vsync marker in every byte
        movne   r12, r11
        tst     r3, #BIT_DEBUG
        eorne   r12, r12, #0x02000000     //green in rightmost
        eorne   r11, r11, #0x05           //magenta in leftmost
.endm

.macro  SETUP_VSYNC_DEBUG_R11_R12_DOUBLE
        // Pixel-doubled counterpart of SETUP_VSYNC_DEBUG_R11_R12: both words are
        // 0x00400040 when BIT_VSYNC_MARKER is set (otherwise 0); with BIT_DEBUG set,
        // r11 gets the left marker and r12 the right marker.
        // In:  r3 = flags
        // Out: r11, r12 = XOR patterns; flags overwritten
        tst     r3, #BIT_VSYNC_MARKER
        moveq   r11, #0                   // no vsync marker
        moveq   r12, #0
        ldrne   r11, =0x00400040          // vsync marker in bytes 0 and 2
        movne   r12, r11
        tst     r3, #BIT_DEBUG
        eorne   r12, r12, #0x00020000     //green in rightmost >> 8
        eorne   r11, r11, #0x05           //magenta in leftmost
.endm

.macro  SETUP_VSYNC_DEBUG_16BPP_R11
        // Pre-computes in r11 the word XORed into each pair of 16bpp pixels (the same
        // 16-bit pattern in both halves) and, when scanline processing applies, loads
        // r12 from param_intensity.
        //   base value        : 0xf000 per pixel, or 0x7000 when BIT_OSD is set
        //   BIT_VSYNC_MARKER  : toggles 0x0f00 per pixel
        //   BITDUP_RGB_INVERT : toggles 0x0fff per pixel
        //   BITDUP_Y_INVERT   : toggles 0x00f0 per pixel
        // In:  r3 = flags
        // Out: r11 = XOR pattern; r12 = [param_intensity] only when both
        //      BIT_NO_SCANLINES and BIT_INTERLACED_VIDEO are clear (otherwise untouched);
        //      flags overwritten
        // The literal loads use the plain `=value` form, as every other literal load in
        // this file does, instead of the non-standard `=#value` spelling.
        tst     r3, #BIT_OSD
        ldreq   r11, =0xf000f000
        ldrne   r11, =0x70007000
        tst     r3, #BIT_VSYNC_MARKER
        eorne   r11, r11, #0x0f000000
        eorne   r11, r11, #0x00000f00

        tst     r3, #BITDUP_RGB_INVERT
        eorne   r11, r11, #0x0f000000
        eorne   r11, r11, #0x00000f00
        eorne   r11, r11, #0x00ff0000
        eorne   r11, r11, #0x000000ff

        tst     r3, #BITDUP_Y_INVERT
        eorne   r11, r11, #0x00f00000
        eorne   r11, r11, #0x000000f0

        tst     r3, #BIT_NO_SCANLINES | BIT_INTERLACED_VIDEO
        ldreq   r12, =param_intensity           // address of the variable ...
        ldreq   r12, [r12]                      // ... then its value
.endm

.macro  SETUP_VSYNC_DEBUG_NOINVERT_16BPP_R11
        // Variant of the 16bpp XOR-pattern setup without the RGB / Y inversion steps.
        //   base value        : 0xf000 per pixel, or 0x7000 when BIT_OSD is set
        //   BIT_VSYNC_MARKER  : toggles 0x0f00 per pixel
        // In:  r3 = flags
        // Out: r11 = XOR pattern; r12 = [param_intensity] only when both
        //      BIT_NO_SCANLINES and BIT_INTERLACED_VIDEO are clear (otherwise untouched);
        //      flags overwritten
        // The literal loads use the plain `=value` form, as every other literal load in
        // this file does, instead of the non-standard `=#value` spelling.
        tst     r3, #BIT_OSD
        ldreq   r11, =0xf000f000
        ldrne   r11, =0x70007000
        tst     r3, #BIT_VSYNC_MARKER
        eorne   r11, r11, #0x0f000000
        eorne   r11, r11, #0x00000f00

        tst     r3, #BIT_NO_SCANLINES | BIT_INTERLACED_VIDEO
        ldreq   r12, =param_intensity           // address of the variable ...
        ldreq   r12, [r12]                      // ... then its value
.endm

.macro  WRITE_R5_R6
        // Stores r5 and r6 at [r0]; when BIT_NO_LINE_DOUBLE is clear a second copy is
        // stored at [r0 - r2], ORed with 0x80808080 first if BIT_NO_SCANLINES, BIT_OSD
        // and BIT_INTERLACED_VIDEO are clear too. Finally advances r0 by 8.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r5 / r6 = data
        // Out: r0 += 8; r5, r6 and r8 may be modified; flags overwritten
        stmia   r0, {r5, r6}                            // first copy
        tst     r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq   r8, =0x80808080
        orreq   r6, r6, r8
        orreq   r5, r5, r8
        tst     r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // step to the other line ...
        stmeqia r0, {r5, r6}                            // ... second copy ...
        addeq   r0, r0, r2                              // ... and step back
        add     r0, r0, #8
.endm

.macro  WRITE_R5_R6_IF_LAST_16BPP
        // Does nothing unless r1 == 1. In that case stores r5 and r6 at [r0]; XORs both
        // with r12 when BIT_NO_SCANLINES and BIT_INTERLACED_VIDEO are clear; then, when
        // BIT_NO_LINE_DOUBLE is clear, stores them again at [r0 - r2].
        // In:  r0 = store address, r1 = counter, r2 = offset subtracted for the second
        //      store (presumably the line pitch), r3 = flags, r5 / r6 = data,
        //      r12 = XOR value for the second copy
        // Out: when the second store happens r0 is left at r0 - r2; r5 and r6 may be
        //      modified; flags overwritten
        cmp     r1, #1
        stmeqia r0, {r5, r6}                            // first copy
        tsteq   r3, #(BIT_NO_SCANLINES | BIT_INTERLACED_VIDEO)
        eoreq   r6, r6, r12
        eoreq   r5, r5, r12
        cmp     r1, #1                                  // re-establish the r1 == 1 condition
        tsteq   r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // note: r0 is not moved back afterwards
        stmeqia r0, {r5, r6}                            // second copy
.endm

.macro  WRITE_R5_R6_R7_R10_16BPP
        // Stores r5, r6, r7 and r10 at [r0]; XORs all four with r12 when
        // BIT_NO_SCANLINES and BIT_INTERLACED_VIDEO are clear; when BIT_NO_LINE_DOUBLE
        // is clear stores them again at [r0 - r2]. Finally advances r0 by 16.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r12 = XOR value for the second copy
        // Out: r0 += 16; r5, r6, r7 and r10 may be modified; flags overwritten
        stmia   r0, {r5, r6, r7, r10}                   // first copy
        tst     r3, #(BIT_NO_SCANLINES | BIT_INTERLACED_VIDEO)
        eoreq   r10, r10, r12
        eoreq   r7, r7, r12
        eoreq   r6, r6, r12
        eoreq   r5, r5, r12

        tst     r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // step to the other line ...
        stmeqia r0, {r5, r6, r7, r10}                   // ... second copy ...
        addeq   r0, r0, r2                              // ... and step back
        add     r0, r0, #16
.endm


.macro  WRITE_R5_R6_IF_LAST
        // Does nothing unless r1 == 1. In that case stores r5 and r6 at [r0]; then, when
        // BIT_NO_LINE_DOUBLE is clear, stores them again at [r0 - r2], ORed with
        // 0x80808080 first if BIT_NO_SCANLINES, BIT_OSD and BIT_INTERLACED_VIDEO are
        // clear too.
        // In:  r0 = store address, r1 = counter, r2 = offset subtracted for the second
        //      store (presumably the line pitch), r3 = flags, r5 / r6 = data
        // Out: when the second store happens r0 is left at r0 - r2; r5, r6 and r8 may
        //      be modified; flags overwritten
        cmp     r1, #1
        stmeqia r0, {r5, r6}                            // first copy
        tsteq   r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq   r8, =0x80808080                         // only when r1 == 1 and none of the flags set
        orreq   r6, r6, r8
        orreq   r5, r5, r8
        cmp     r1, #1                                  // re-establish the r1 == 1 condition
        tsteq   r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // note: r0 is not moved back afterwards
        stmeqia r0, {r5, r6}                            // second copy
.endm

.macro  WRITE_R5_R6_R7_R10
        // Stores r5, r6, r7 and r10 at [r0]; when BIT_NO_LINE_DOUBLE is clear a second
        // copy is stored at [r0 - r2], ORed with 0x80808080 first if BIT_NO_SCANLINES,
        // BIT_OSD and BIT_INTERLACED_VIDEO are clear too. Finally advances r0 by 16.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags
        // Out: r0 += 16; r5, r6, r7, r10 and r8 may be modified; flags overwritten
        stmia   r0, {r5, r6, r7, r10}                   // first copy
        tst     r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq   r8, =0x80808080
        orreq   r10, r10, r8
        orreq   r7, r7, r8
        orreq   r6, r6, r8
        orreq   r5, r5, r8
        tst     r3, #BIT_NO_LINE_DOUBLE
        subeq   r0, r0, r2                              // step to the other line ...
        stmeqia r0, {r5, r6, r7, r10}                   // ... second copy ...
        addeq   r0, r0, r2                              // ... and step back
        add     r0, r0, #16
.endm




.macro WRITE_WORD_FAST
        // XORs the pre-computed pattern in r6 into r10 and stores the word at [r0];
        // when BIT_NO_LINE_DOUBLE is clear a second copy is stored at [r0 - r2], ORed
        // with 0x88888888 first if BIT_NO_SCANLINES, BIT_OSD and BIT_INTERLACED_VIDEO
        // are clear too. Finally advances r0 by 4.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r6 = XOR pattern, r10 = pixel word
        // Out: r0 += 4; r10 and r8 may be modified; flags overwritten
        eor    r10, r6, r10     //eor in vsync and debug
        str    r10, [r0]                                // first copy
        tst    r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq  r8, =0x88888888
        orreq  r10, r10, r8
        tst    r3, #BIT_NO_LINE_DOUBLE
        streq  r10, [r0, -r2]                           // second copy, r0 itself unchanged
        add    r0, r0, #4
.endm

.macro WRITE_WORDS_8BPP_FAST
        // XORs the pre-computed patterns in r5 / r6 into r9 / r10 and stores the pair at
        // [r0]; when BIT_NO_LINE_DOUBLE is clear a second copy is stored at [r0 - r2],
        // ORed with 0x80808080 first if BIT_NO_SCANLINES, BIT_OSD and
        // BIT_INTERLACED_VIDEO are clear too. Finally advances r0 by 8.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r5 / r6 = XOR patterns, r9 / r10 = pixel words
        // Out: r0 += 8; r9, r10 and r8 may be modified; flags overwritten
        eor    r9, r5, r9       //eor in vsync and debug
        eor    r10, r6, r10     //eor in vsync and debug
        stmia  r0, {r9, r10}                            // first copy
        sub    r0, r0, r2                               // always step to the other line
        tst    r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq  r8, =0x80808080
        orreq  r10, r10, r8
        orreq  r9, r9, r8
        tst    r3, #BIT_NO_LINE_DOUBLE
        stmeqia  r0, {r9, r10}                          // second copy
        add    r0, r0, r2                               // step back
        add    r0, r0, #8
.endm

.macro WRITE_WORD
        // Applies the vsync and debug markers to r10 directly from the flags in r3 and
        // stores the word at [r0]; when BIT_NO_LINE_DOUBLE is clear a second copy is
        // stored at [r0 - r2], ORed with 0x88888888 first if BIT_NO_SCANLINES, BIT_OSD
        // and BIT_INTERLACED_VIDEO are clear too. Finally advances r0 by 4.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r10 = pixel word
        // Out: r0 += 4; r10 and r8 may be modified; flags overwritten
        tst    r3, #BIT_VSYNC_MARKER
        ldrne  r8, =0x11111111
        eorne  r10, r10, r8            // eor in the VSync indicator (orr doesn't work on zx80/81 due to white screen)
        tst    r3, #BIT_DEBUG
        eorne  r10, r10, #0x02000000   //green in rightmost
        eorne  r10, r10, #0x50         //magenta in leftmost
        str    r10, [r0]                                // first copy
        tst    r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq  r8, =0x88888888
        orreq  r10, r10, r8
        tst    r3, #BIT_NO_LINE_DOUBLE
        streq  r10, [r0, -r2]                           // second copy, r0 itself unchanged
        add    r0, r0, #4
.endm

.macro WRITE_WORDS_8BPP
        // ORs the palette-high-nibble field of r3 into every byte of r9 / r10, applies
        // the vsync and debug markers from the flags in r3 and stores the pair at [r0];
        // when BIT_NO_LINE_DOUBLE is clear a second copy is stored at [r0 - r2], ORed
        // with 0x80808080 first if BIT_NO_SCANLINES, BIT_OSD and BIT_INTERLACED_VIDEO
        // are clear too. Finally advances r0 by 8.
        // In:  r0 = store address, r2 = offset of the second copy (presumably the line
        //      pitch), r3 = flags, r9 / r10 = pixel words
        // Out: r0 += 8; r9, r10 and r8 may be modified; flags overwritten
        and    r8, r3, #MASKDUP_PALETTE_HIGH_NIBBLE
        mov    r8, r8, lsr #(OFFSETDUP_PALETTE_HIGH_NIBBLE - 4)   // field shifted down, leaving it 4 bits up
        orr    r8, r8, r8, lsl #8                                 // replicate into every byte
        orr    r8, r8, r8, lsl #16
        orr    r10, r10, r8
        orr    r9, r9, r8
        tst    r3, #BIT_VSYNC_MARKER
        ldrne  r8, =0x40404040
        eorne  r10, r10, r8
        eorne  r9, r9, r8              // eor in the VSync indicator (orr doesn't work on zx80/81 due to white screen)
        tst    r3, #BIT_DEBUG
        eorne  r10, r10, #0x02000000   //green in rightmost
        eorne  r9, r9, #0x05           //magenta in leftmost
        stmia  r0, {r9, r10}                            // first copy
        sub    r0, r0, r2                               // always step to the other line
        tst    r3, #(BIT_NO_SCANLINES | BIT_OSD | BIT_NO_LINE_DOUBLE | BIT_INTERLACED_VIDEO)
        ldreq  r8, =0x80808080
        orreq  r10, r10, r8
        orreq  r9, r9, r8
        tst    r3, #BIT_NO_LINE_DOUBLE
        stmeqia  r0, {r9, r10}                          // second copy
        add    r0, r0, r2                               // step back
        add    r0, r0, #8
.endm

.macro SETUP_DUMMY_PARAMETERS
        // Loads a fixed set of register values that point the output at dummyscreen
        // rather than a real buffer.
        // Out: r0 = dummyscreen + 1024, r1 = 8, r2 = 0, r5 = 1, r6 = 0, r7 = 4,
        //      r8 = 256, r9 = 0; r3 has BIT_VSYNC_MARKER set and BIT_NO_SKIP_HSYNC
        //      cleared; flags are not modified
        bic     r3, r3, #BIT_NO_SKIP_HSYNC
        orr     r3, r3, #BIT_VSYNC_MARKER    // ensure that constants are in data cache
        ldr     r0, =(dummyscreen + 1024)    //in case data written backwards
        mov     r2, #0
        mov     r1, #8
        mov     r9, #0  //force skip of wait for csync 0
        mov     r8, #256
        mov     r7, #4
        mov     r6, #0
        mov     r5, #1
.endm

// ======================================================================
// Macros
// ======================================================================


// Data Synchronisation Barrier
.macro _DSB
        // Issues a Data Synchronisation Barrier, selecting the encoding at run time from
        // the value returned by _get_hardware_id: the `dsb` instruction when the id is
        // >= _RPI2, the ARMv6 CP15 c7,c10,4 operation otherwise.
        // r0 and lr are preserved on the stack. Flags are overwritten; any other
        // register _get_hardware_id modifies is NOT preserved - TODO confirm its clobbers.
        push   {r0, lr}
        bl     _get_hardware_id
        cmp    r0, #_RPI2
        blt    rpi0_1_a\@
        dsb
        b      donerpi0_1_a\@
rpi0_1_a\@:
        mov    r0, #0                      // the CP15 barrier operand Should-Be-Zero (r0 held the hardware id)
        mcr    p15, 0, r0, c7, c10, 4
donerpi0_1_a\@:
        pop    {r0, lr}
.endm

// Data Memory Barrier
.macro _DMB
        // Issues a Data Memory Barrier, selecting the encoding at run time from the
        // value returned by _get_hardware_id: the `dmb` instruction when the id is
        // >= _RPI2, the ARMv6 CP15 c7,c10,5 operation otherwise.
        // r0 and lr are preserved on the stack. Flags are overwritten; any other
        // register _get_hardware_id modifies is NOT preserved - TODO confirm its clobbers.
        push   {r0, lr}
        bl     _get_hardware_id
        cmp    r0, #_RPI2
        blt    rpi0_1_b\@
        dmb
        b      donerpi0_1_b\@
rpi0_1_b\@:
        mov    r0, #0                      // the CP15 barrier operand Should-Be-Zero (r0 held the hardware id)
        mcr    p15, 0, r0, c7, c10, 5
donerpi0_1_b\@:
        pop    {r0, lr}
.endm

.macro READ_CYCLE_COUNTER reg
        // Reads the CPU cycle counter into \reg using the CP15 encoding selected by
        // BIT_RPI234 in r3: c15,c12,1 when the bit is clear, c9,c13,0 when it is set.
        // In:  r3 = flags
        // Out: \reg = counter value; flags are overwritten by the tst
        // The branch is deliberate: per the note below, the first mrc must never be
        // fetched for execution on ARMv8, even with a failing condition.
        tst    r3, #BIT_RPI234
        bne    skip_armv6_instruction\@
        mrceq  p15, 0, \reg, c15, c12, 1   //this arm v6 instruction works on arm v7 but faults on arm v8 even if condition is set to skip
skip_armv6_instruction\@:
        mrcne  p15, 0, \reg, c9, c13, 0    //this arm v7 / v8 instruction gets skipped on arm v6 without faulting (saves a branch)
.endm

#ifdef MULTI_BUFFER
.macro FLIP_BUFFER
        // Calls swapBuffer with the 2-bit buffer number held in r3 at
        // OFFSET_LAST_BUFFER, unless BIT_INTERLACED_VIDEO or BIT_PROBE is set.
        // In:  r3 = flags
        // Out: r0-r3 are preserved around the call; r14 (lr) is overwritten by the bl;
        //      flags are overwritten; other registers are subject to whatever
        //      swapBuffer modifies - TODO confirm
        // Skip the multi buffering in mode 7 and probe mode
        tst    r3, #(BIT_INTERLACED_VIDEO | BIT_PROBE)
        bne    noflip\@
        // Flip to the last completed draw buffer
        // It seems the GPU delays this until the next vsync
        push   {r0-r3}
        mov    r0, r3, lsr #OFFSET_LAST_BUFFER     // bring the buffer field down to bit 0
        and    r0, r0, #3                          // argument = 2-bit buffer number
        bl     swapBuffer
        pop    {r0-r3}
noflip\@:
.endm
#endif

.macro KEY_PRESS_DETECT mask, ret, counter
        // Updates the hold counter for one button and ORs \ret into r0 on the initial
        // press and at the auto-repeat points.
        //   \mask    = bit(s) of r8 for this button (0 = pressed)
        //   \ret     = value ORed into r0 when a press is reported
        //   \counter = address of this button's hold counter in memory
        // In:  r8 = button inputs, r9 = sw1_power_up (non-zero resets the counter before
        //      the button is examined), r0 = result accumulator
        // Out: r0 possibly updated; r5 = new counter value; r6 = counter address;
        //      flags overwritten
        //enters with sw1_power_up in r9
        ldr    r6, =\counter       // r6 -> counter variable
        ldr    r5, [r6]            // r5 = current count
        cmp    r9, #0
        movne  r5, #0              // power-up switch state active: restart the count
        tst    r8, #\mask          // Is the button pressed (active low)?
        movne  r5, #0              // Released: clear the counter
        addeq  r5, r5, #1          // Pressed: increment the counter value
        str    r5, [r6]            // And always write back the counter value

        cmp    r5, #1              // Counter goes from 0->1 when the key is initially pressed
        orreq  r0, r0, #\ret       // Indicate the initial press in the result

        cmp    r5, #32             // 32 = auto repeat delay
        tstge  r5, #7              // from then on, every 8th count
        orreq  r0, r0, #\ret       // Indicate the auto repeated press in the result

        cmp    r5, #128            // 128 = second-stage delay
        tstge  r5, #3              // from then on, every 4th count
        orreq  r0, r0, #\ret       // Indicate the auto repeated press in the result

        cmp    r5, #256            // 256 = third-stage delay
        tstge  r5, #1              // from then on, every 2nd count
        orreq  r0, r0, #\ret       // Indicate the auto repeated press in the result
.endm
