From 1353d8feca19f2f84019797942d70864054db1b0 Mon Sep 17 00:00:00 2001 From: Ben Avison <bavison@riscosopen.org> Date: Mon, 5 Aug 2013 13:12:46 +0100 Subject: [PATCH 01/94] h264_parser: Initialize the h264dsp context in the parser as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each AVStream struct for an H.264 elementary stream actually has two copies of the H264DSPContext struct (and in fact all the other members of H264Context as well): ((H264Context *) ((AVStream *)st)->codec->priv_data)->h264dsp ((H264Context *) ((AVStream *)st)->parser->priv_data)->h264dsp but only the first of these was actually being initialised. This prevented the addition of platform-specific implementations of parser-related functions. Signed-off-by: Martin Storsjö <martin@martin.st> --- lib/ffmpeg/libavcodec/h264_parser.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/ffmpeg/libavcodec/h264_parser.c b/lib/ffmpeg/libavcodec/h264_parser.c index aff9ba1..a732f79 100644 --- a/lib/ffmpeg/libavcodec/h264_parser.c +++ b/lib/ffmpeg/libavcodec/h264_parser.c @@ -386,6 +386,7 @@ static int init(AVCodecParserContext *s) H264Context *h = s->priv_data; h->thread_context[0] = h; h->slice_context_count = 1; + ff_h264dsp_init(&h->h264dsp, 8, 1); return 0; } -- 1.9.3 From 7ea2cb68f6fb1149fce70854e36ed6357a267238 Mon Sep 17 00:00:00 2001 From: Ben Avison <bavison@riscosopen.org> Date: Mon, 5 Aug 2013 13:12:47 +0100 Subject: [PATCH 02/94] h264dsp: Factorize code into a new function, h264_find_start_code_candidate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This performs the start code search which was previously part of h264_find_frame_end() - the most CPU intensive part of the function. 
By itself, this results in a performance regression: Before After Mean StdDev Mean StdDev Change Overall time 2925.6 26.2 3068.5 31.7 -4.7% but this can more than be made up for by platform-optimised implementations of the function. Signed-off-by: Martin Storsjö <martin@martin.st> --- lib/ffmpeg/libavcodec/h264_parser.c | 20 +++----------------- lib/ffmpeg/libavcodec/h264dsp.c | 29 +++++++++++++++++++++++++++++ lib/ffmpeg/libavcodec/h264dsp.h | 9 +++++++++ 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/lib/ffmpeg/libavcodec/h264_parser.c b/lib/ffmpeg/libavcodec/h264_parser.c index a732f79..972aace 100644 --- a/lib/ffmpeg/libavcodec/h264_parser.c +++ b/lib/ffmpeg/libavcodec/h264_parser.c @@ -62,23 +62,9 @@ static int ff_h264_find_frame_end(H264Context *h, const uint8_t *buf, int buf_si } if(state==7){ -#if HAVE_FAST_UNALIGNED - /* we check i<buf_size instead of i+3/7 because its simpler - * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end - */ -# if HAVE_FAST_64BIT - while(i<next_avc && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL)) - i+=8; -# else - while(i<next_avc && !((~*(const uint32_t*)(buf+i) & (*(const uint32_t*)(buf+i) - 0x01010101U)) & 0x80808080U)) - i+=4; -# endif -#endif - for(; i<next_avc; i++){ - if(!buf[i]){ - state=2; - break; - } + i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i); + if (i < buf_size) + state = 2; } }else if(state<=2){ if(buf[i]==1) state^= 5; //2->7, 1->4, 0->5 diff --git a/lib/ffmpeg/libavcodec/h264dsp.c b/lib/ffmpeg/libavcodec/h264dsp.c index da9e417..b7d61cd 100644 --- a/lib/ffmpeg/libavcodec/h264dsp.c +++ b/lib/ffmpeg/libavcodec/h264dsp.c @@ -60,6 +60,34 @@ #include "h264addpx_template.c" #undef BIT_DEPTH +static int h264_find_start_code_candidate_c(const uint8_t *buf, int size) +{ + int i = 0; +#if HAVE_FAST_UNALIGNED + /* we check i < size instead of i + 3 / 7 because it is + * simpler and there must be 
FF_INPUT_BUFFER_PADDING_SIZE + * bytes at the end. + */ +#if HAVE_FAST_64BIT + while (i < size && + !((~*(const uint64_t *)(buf + i) & + (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & + 0x8080808080808080ULL)) + i += 8; +#else + while (i < size && + !((~*(const uint32_t *)(buf + i) & + (*(const uint32_t *)(buf + i) - 0x01010101U)) & + 0x80808080U)) + i += 4; +#endif +#endif + for (; i < size; i++) + if (!buf[i]) + break; + return i; +} + void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { #undef FUNC @@ -146,6 +174,7 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo H264_DSP(8); break; } + c->h264_find_start_code_candidate = h264_find_start_code_candidate_c; if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); diff --git a/lib/ffmpeg/libavcodec/h264dsp.h b/lib/ffmpeg/libavcodec/h264dsp.h index 98ea15c..1be4804 100644 --- a/lib/ffmpeg/libavcodec/h264dsp.h +++ b/lib/ffmpeg/libavcodec/h264dsp.h @@ -105,6 +105,15 @@ typedef struct H264DSPContext { /* bypass-transform */ void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); + + /** + * Search buf from the start for up to size bytes. Return the index + * of a zero byte, or >= size if not found. Ideally, use lookahead + * to filter out any zero bytes that are known to not be followed by + * one or more further zero bytes and a one byte. Better still, filter + * out any bytes that form the trailing_zero_8bits syntax element too. 
+ */ + int (*h264_find_start_code_candidate)(const uint8_t *buf, int size); } H264DSPContext; void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, -- 1.9.3 From 458ff4b6c1855c529f563dbbd15e35aaab50adae Mon Sep 17 00:00:00 2001 From: Ben Avison <bavison@riscosopen.org> Date: Mon, 5 Aug 2013 13:12:48 +0100 Subject: [PATCH 03/94] arm: Add assembly version of h264_find_start_code_candidate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before After Mean StdDev Mean StdDev Change This function 508.8 23.4 185.4 9.0 +174.4% Overall 3068.5 31.7 2752.1 29.4 +11.5% In combination with the preceding patch: Before After Mean StdDev Mean StdDev Change Overall 2925.6 26.2 2752.1 29.4 +6.3% Signed-off-by: Martin Storsjö <martin@martin.st> --- lib/ffmpeg/libavcodec/arm/Makefile | 1 + lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S | 253 +++++++++++++++++++++++++++ lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c | 4 + lib/ffmpeg/libavcodec/h264_parser.c | 1 - 4 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S diff --git a/lib/ffmpeg/libavcodec/arm/Makefile b/lib/ffmpeg/libavcodec/arm/Makefile index 7390a8b..480000b71 100644 --- a/lib/ffmpeg/libavcodec/arm/Makefile +++ b/lib/ffmpeg/libavcodec/arm/Makefile @@ -9,6 +9,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o \ OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o +ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \ arm/flacdsp_arm.o \ diff --git a/lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S b/lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S new file mode 100644 index 0000000..c4f12a6 --- /dev/null +++ b/lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2013 RISC OS Open Ltd + * Author: Ben Avison <bavison@riscosopen.org> + * + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +RESULT .req a1 +BUF .req a1 +SIZE .req a2 +PATTERN .req a3 +PTR .req a4 +DAT0 .req v1 +DAT1 .req v2 +DAT2 .req v3 +DAT3 .req v4 +TMP0 .req v5 +TMP1 .req v6 +TMP2 .req ip +TMP3 .req lr + +#define PRELOAD_DISTANCE 4 + +.macro innerloop4 + ldr DAT0, [PTR], #4 + subs SIZE, SIZE, #4 @ C flag survives rest of macro + sub TMP0, DAT0, PATTERN, lsr #14 + bic TMP0, TMP0, DAT0 + ands TMP0, TMP0, PATTERN +.endm + +.macro innerloop16 decrement, do_preload + ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} + .ifnc "\do_preload","" + pld [PTR, #PRELOAD_DISTANCE*32] + .endif + .ifnc "\decrement","" + subs SIZE, SIZE, #\decrement @ C flag survives rest of macro + .endif + sub TMP0, DAT0, PATTERN, lsr #14 + sub TMP1, DAT1, PATTERN, lsr #14 + bic TMP0, TMP0, DAT0 + bic TMP1, TMP1, DAT1 + sub TMP2, DAT2, PATTERN, lsr #14 + sub TMP3, DAT3, PATTERN, lsr #14 + ands TMP0, TMP0, PATTERN + bic TMP2, TMP2, DAT2 + it eq + andseq TMP1, TMP1, PATTERN + bic TMP3, TMP3, DAT3 + itt eq + andseq TMP2, TMP2, PATTERN + andseq TMP3, TMP3, PATTERN +.endm + +/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */ +function ff_h264_find_start_code_candidate_armv6, export=1 + push {v1-v6,lr} + mov PTR, BUF + @ Ensure there are at least 
(PRELOAD_DISTANCE+2) complete cachelines to go + @ before using code that does preloads + cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 + blo 60f + + @ Get to word-alignment, 1 byte at a time + tst PTR, #3 + beq 2f +1: ldrb DAT0, [PTR], #1 + sub SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + tst PTR, #3 + bne 1b +2: @ Get to 4-word alignment, 1 word at a time + ldr PATTERN, =0x80008000 + setend be + tst PTR, #12 + beq 4f +3: innerloop4 + bne 91f + tst PTR, #12 + bne 3b +4: @ Get to cacheline (8-word) alignment + tst PTR, #16 + beq 5f + innerloop16 16 + bne 93f +5: @ Check complete cachelines, with preloading + @ We need to stop when there are still (PRELOAD_DISTANCE+1) + @ complete cachelines to go + sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 +6: innerloop16 , do_preload + bne 93f + innerloop16 32 + bne 93f + bcs 6b + @ Preload trailing part-cacheline, if any + tst SIZE, #31 + beq 7f + pld [PTR, #(PRELOAD_DISTANCE+1)*32] + @ Check remaining data without doing any more preloads. First + @ do in chunks of 4 words: +7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 + bmi 9f +8: innerloop16 16 + bne 93f + bcs 8b + @ Then in words: +9: adds SIZE, SIZE, #16 - 4 + bmi 11f +10: innerloop4 + bne 91f + bcs 10b +11: setend le + @ Check second byte of final halfword + ldrb DAT0, [PTR, #-1] + teq DAT0, #0 + beq 90f + @ Check any remaining bytes + tst SIZE, #3 + beq 13f +12: ldrb DAT0, [PTR], #1 + sub SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + tst SIZE, #3 + bne 12b + @ No candidate found +13: sub RESULT, PTR, BUF + b 99f + +60: @ Small buffer - simply check by looping over bytes + subs SIZE, SIZE, #1 + bcc 99f +61: ldrb DAT0, [PTR], #1 + subs SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + bcs 61b + @ No candidate found + sub RESULT, PTR, BUF + b 99f + +90: @ Found a candidate at the preceding byte + sub RESULT, PTR, BUF + sub RESULT, RESULT, #1 + b 99f + +91: @ Found a candidate somewhere in the preceding 4 bytes + sub RESULT, PTR, BUF + sub RESULT, RESULT, #4 + sub TMP0, DAT0, #0x20000 + bics 
TMP0, TMP0, DAT0 + itt pl + ldrbpl DAT0, [PTR, #-3] + addpl RESULT, RESULT, #2 + bpl 92f + teq RESULT, #0 + beq 98f @ don't look back a byte if found at first byte in buffer + ldrb DAT0, [PTR, #-5] +92: teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f + +93: @ Found a candidate somewhere in the preceding 16 bytes + sub RESULT, PTR, BUF + sub RESULT, RESULT, #16 + teq TMP0, #0 + beq 95f @ not in first 4 bytes + sub TMP0, DAT0, #0x20000 + bics TMP0, TMP0, DAT0 + itt pl + ldrbpl DAT0, [PTR, #-15] + addpl RESULT, RESULT, #2 + bpl 94f + teq RESULT, #0 + beq 98f @ don't look back a byte if found at first byte in buffer + ldrb DAT0, [PTR, #-17] +94: teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +95: add RESULT, RESULT, #4 + teq TMP1, #0 + beq 96f @ not in next 4 bytes + sub TMP1, DAT1, #0x20000 + bics TMP1, TMP1, DAT1 + itee mi + ldrbmi DAT0, [PTR, #-13] + ldrbpl DAT0, [PTR, #-11] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +96: add RESULT, RESULT, #4 + teq TMP2, #0 + beq 97f @ not in next 4 bytes + sub TMP2, DAT2, #0x20000 + bics TMP2, TMP2, DAT2 + itee mi + ldrbmi DAT0, [PTR, #-9] + ldrbpl DAT0, [PTR, #-7] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +97: add RESULT, RESULT, #4 + sub TMP3, DAT3, #0x20000 + bics TMP3, TMP3, DAT3 + itee mi + ldrbmi DAT0, [PTR, #-5] + ldrbpl DAT0, [PTR, #-3] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + @ drop through to 98f +98: setend le +99: pop {v1-v6,pc} +.endfunc + + .unreq RESULT + .unreq BUF + .unreq SIZE + .unreq PATTERN + .unreq PTR + .unreq DAT0 + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 + .unreq TMP0 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 diff --git a/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c b/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c index 785b604..2804e56 100644 --- a/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c +++ b/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c @@ -24,6 +24,8 @@ #include 
"libavutil/arm/cpu.h" #include "libavcodec/h264dsp.h" +int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); + void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, @@ -106,6 +108,8 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, { int cpu_flags = av_get_cpu_flags(); + if (have_armv6(cpu_flags)) + c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; if (have_neon(cpu_flags)) ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc); } diff --git a/lib/ffmpeg/libavcodec/h264_parser.c b/lib/ffmpeg/libavcodec/h264_parser.c index 972aace..363843c 100644 --- a/lib/ffmpeg/libavcodec/h264_parser.c +++ b/lib/ffmpeg/libavcodec/h264_parser.c @@ -65,7 +65,6 @@ static int ff_h264_find_frame_end(H264Context *h, const uint8_t *buf, int buf_si i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i); if (i < buf_size) state = 2; - } }else if(state<=2){ if(buf[i]==1) state^= 5; //2->7, 1->4, 0->5 else if(buf[i]) state = 7; -- 1.9.3 From 5841d5b69f0df2f286c0a8e419deb16d927e864e Mon Sep 17 00:00:00 2001 From: popcornmix <popcornmix@gmail.com> Date: Mon, 19 Aug 2013 22:48:05 +0100 Subject: [PATCH 04/94] [ffmpeg] Backport of h264_find_start_code_candidate optimisation --- ...-Initialize-the-h264dsp-context-in-the-pa.patch | 39 +++ ...torize-code-into-a-new-function-h264_find.patch | 134 +++++++++ ...embly-version-of-h264_find_start_code_can.patch | 322 +++++++++++++++++++++ 3 files changed, 495 insertions(+) create mode 100644 lib/ffmpeg/patches/0056-h264_parser-Initialize-the-h264dsp-context-in-the-pa.patch create mode 100644 lib/ffmpeg/patches/0057-h264dsp-Factorize-code-into-a-new-function-h264_find.patch create mode 100644 lib/ffmpeg/patches/0058-arm-Add-assembly-version-of-h264_find_start_code_can.patch diff --git 
a/lib/ffmpeg/patches/0056-h264_parser-Initialize-the-h264dsp-context-in-the-pa.patch b/lib/ffmpeg/patches/0056-h264_parser-Initialize-the-h264dsp-context-in-the-pa.patch new file mode 100644 index 0000000..263578d --- /dev/null +++ b/lib/ffmpeg/patches/0056-h264_parser-Initialize-the-h264dsp-context-in-the-pa.patch @@ -0,0 +1,39 @@ +From 7a82022ee2f9b1fad991ace0936901e7419444be Mon Sep 17 00:00:00 2001 +From: Ben Avison <bavison@riscosopen.org> +Date: Mon, 5 Aug 2013 13:12:46 +0100 +Subject: [PATCH 1/3] h264_parser: Initialize the h264dsp context in the + parser as well +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Each AVStream struct for an H.264 elementary stream actually has two +copies of the H264DSPContext struct (and in fact all the other members +of H264Context as well): + +((H264Context *) ((AVStream *)st)->codec->priv_data)->h264dsp +((H264Context *) ((AVStream *)st)->parser->priv_data)->h264dsp + +but only the first of these was actually being initialised. This +prevented the addition of platform-specific implementations of +parser-related functions. 
+ +Signed-off-by: Martin Storsjö <martin@martin.st> +--- + libavcodec/h264_parser.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c +index 2ed155c..da2a5f9 100644 +--- a/libavcodec/h264_parser.c ++++ b/libavcodec/h264_parser.c +@@ -417,6 +417,7 @@ static av_cold int init(AVCodecParserContext *s) + H264Context *h = s->priv_data; + h->thread_context[0] = h; + h->slice_context_count = 1; ++ ff_h264dsp_init(&h->h264dsp, 8, 1); + return 0; + } + +-- +1.7.9.5 diff --git a/lib/ffmpeg/patches/0057-h264dsp-Factorize-code-into-a-new-function-h264_find.patch b/lib/ffmpeg/patches/0057-h264dsp-Factorize-code-into-a-new-function-h264_find.patch new file mode 100644 index 0000000..0151d85 --- /dev/null +++ b/lib/ffmpeg/patches/0057-h264dsp-Factorize-code-into-a-new-function-h264_find.patch @@ -0,0 +1,134 @@ +From 218d6844b37d339ffbf2044ad07d8be7767e2734 Mon Sep 17 00:00:00 2001 +From: Ben Avison <bavison@riscosopen.org> +Date: Mon, 5 Aug 2013 13:12:47 +0100 +Subject: [PATCH 2/3] h264dsp: Factorize code into a new function, + h264_find_start_code_candidate +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This performs the start code search which was previously part of +h264_find_frame_end() - the most CPU intensive part of the function. + +By itself, this results in a performance regression: + Before After + Mean StdDev Mean StdDev Change +Overall time 2925.6 26.2 3068.5 31.7 -4.7% + +but this can more than be made up for by platform-optimised +implementations of the function. 
+ +Signed-off-by: Martin Storsjö <martin@martin.st> +--- + libavcodec/h264_parser.c | 27 +++------------------------ + libavcodec/h264dsp.c | 29 +++++++++++++++++++++++++++++ + libavcodec/h264dsp.h | 9 +++++++++ + 3 files changed, 41 insertions(+), 24 deletions(-) + +diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c +index da2a5f9..ef5da98 100644 +--- a/libavcodec/h264_parser.c ++++ b/libavcodec/h264_parser.c +@@ -47,30 +47,9 @@ static int h264_find_frame_end(H264Context *h, const uint8_t *buf, + + for (i = 0; i < buf_size; i++) { + if (state == 7) { +-#if HAVE_FAST_UNALIGNED +- /* we check i < buf_size instead of i + 3 / 7 because it is +- * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE +- * bytes at the end. +- */ +-#if HAVE_FAST_64BIT +- while (i < buf_size && +- !((~*(const uint64_t *)(buf + i) & +- (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & +- 0x8080808080808080ULL)) +- i += 8; +-#else +- while (i < buf_size && +- !((~*(const uint32_t *)(buf + i) & +- (*(const uint32_t *)(buf + i) - 0x01010101U)) & +- 0x80808080U)) +- i += 4; +-#endif +-#endif +- for (; i < buf_size; i++) +- if (!buf[i]) { +- state = 2; +- break; +- } ++ i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i); ++ if (i < buf_size) ++ state = 2; + } else if (state <= 2) { + if (buf[i] == 1) + state ^= 5; // 2->7, 1->4, 0->5 +diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c +index 3ca6abe..a901dbb 100644 +--- a/libavcodec/h264dsp.c ++++ b/libavcodec/h264dsp.c +@@ -53,6 +53,34 @@ + #include "h264addpx_template.c" + #undef BIT_DEPTH + ++static int h264_find_start_code_candidate_c(const uint8_t *buf, int size) ++{ ++ int i = 0; ++#if HAVE_FAST_UNALIGNED ++ /* we check i < size instead of i + 3 / 7 because it is ++ * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE ++ * bytes at the end. 
++ */ ++#if HAVE_FAST_64BIT ++ while (i < size && ++ !((~*(const uint64_t *)(buf + i) & ++ (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & ++ 0x8080808080808080ULL)) ++ i += 8; ++#else ++ while (i < size && ++ !((~*(const uint32_t *)(buf + i) & ++ (*(const uint32_t *)(buf + i) - 0x01010101U)) & ++ 0x80808080U)) ++ i += 4; ++#endif ++#endif ++ for (; i < size; i++) ++ if (!buf[i]) ++ break; ++ return i; ++} ++ + av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) + { +@@ -133,6 +161,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, + H264_DSP(8); + break; + } ++ c->h264_find_start_code_candidate = h264_find_start_code_candidate_c; + + if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); + if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); +diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h +index 1f9f8fe..6249ba7 100644 +--- a/libavcodec/h264dsp.h ++++ b/libavcodec/h264dsp.h +@@ -105,6 +105,15 @@ typedef struct H264DSPContext { + /* bypass-transform */ + void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); + void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride); ++ ++ /** ++ * Search buf from the start for up to size bytes. Return the index ++ * of a zero byte, or >= size if not found. Ideally, use lookahead ++ * to filter out any zero bytes that are known to not be followed by ++ * one or more further zero bytes and a one byte. Better still, filter ++ * out any bytes that form the trailing_zero_8bits syntax element too. 
++ */ ++ int (*h264_find_start_code_candidate)(const uint8_t *buf, int size); + } H264DSPContext; + + void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, +-- +1.7.9.5 diff --git a/lib/ffmpeg/patches/0058-arm-Add-assembly-version-of-h264_find_start_code_can.patch b/lib/ffmpeg/patches/0058-arm-Add-assembly-version-of-h264_find_start_code_can.patch new file mode 100644 index 0000000..cdc2d1e --- /dev/null +++ b/lib/ffmpeg/patches/0058-arm-Add-assembly-version-of-h264_find_start_code_can.patch @@ -0,0 +1,322 @@ +From 45e10e5c8d3df09c80a4d80483bff2712367f3fa Mon Sep 17 00:00:00 2001 +From: Ben Avison <bavison@riscosopen.org> +Date: Mon, 5 Aug 2013 13:12:48 +0100 +Subject: [PATCH 3/3] arm: Add assembly version of + h264_find_start_code_candidate +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + + Before After + Mean StdDev Mean StdDev Change +This function 508.8 23.4 185.4 9.0 +174.4% +Overall 3068.5 31.7 2752.1 29.4 +11.5% + +In combination with the preceding patch: + Before After + Mean StdDev Mean StdDev Change +Overall 2925.6 26.2 2752.1 29.4 +6.3% + +Signed-off-by: Martin Storsjö <martin@martin.st> +--- + libavcodec/arm/Makefile | 1 + + libavcodec/arm/h264dsp_armv6.S | 253 +++++++++++++++++++++++++++++++++++++ + libavcodec/arm/h264dsp_init_arm.c | 4 + + 3 files changed, 258 insertions(+) + create mode 100644 libavcodec/arm/h264dsp_armv6.S + +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index e941aaa..9c64b36 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -45,6 +45,7 @@ ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \ + arm/simple_idct_armv6.o \ + + ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o ++ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o + ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ + arm/hpeldsp_armv6.o + ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o +diff --git a/libavcodec/arm/h264dsp_armv6.S 
b/libavcodec/arm/h264dsp_armv6.S +new file mode 100644 +index 0000000..c4f12a6 +--- /dev/null ++++ b/libavcodec/arm/h264dsp_armv6.S +@@ -0,0 +1,253 @@ ++/* ++ * Copyright (c) 2013 RISC OS Open Ltd ++ * Author: Ben Avison <bavison@riscosopen.org> ++ * ++ * This file is part of Libav. ++ * ++ * Libav is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * Libav is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with Libav; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++ ++RESULT .req a1 ++BUF .req a1 ++SIZE .req a2 ++PATTERN .req a3 ++PTR .req a4 ++DAT0 .req v1 ++DAT1 .req v2 ++DAT2 .req v3 ++DAT3 .req v4 ++TMP0 .req v5 ++TMP1 .req v6 ++TMP2 .req ip ++TMP3 .req lr ++ ++#define PRELOAD_DISTANCE 4 ++ ++.macro innerloop4 ++ ldr DAT0, [PTR], #4 ++ subs SIZE, SIZE, #4 @ C flag survives rest of macro ++ sub TMP0, DAT0, PATTERN, lsr #14 ++ bic TMP0, TMP0, DAT0 ++ ands TMP0, TMP0, PATTERN ++.endm ++ ++.macro innerloop16 decrement, do_preload ++ ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} ++ .ifnc "\do_preload","" ++ pld [PTR, #PRELOAD_DISTANCE*32] ++ .endif ++ .ifnc "\decrement","" ++ subs SIZE, SIZE, #\decrement @ C flag survives rest of macro ++ .endif ++ sub TMP0, DAT0, PATTERN, lsr #14 ++ sub TMP1, DAT1, PATTERN, lsr #14 ++ bic TMP0, TMP0, DAT0 ++ bic TMP1, TMP1, DAT1 ++ sub TMP2, DAT2, PATTERN, lsr #14 ++ sub TMP3, DAT3, PATTERN, lsr #14 ++ ands TMP0, TMP0, PATTERN ++ bic TMP2, TMP2, 
DAT2 ++ it eq ++ andseq TMP1, TMP1, PATTERN ++ bic TMP3, TMP3, DAT3 ++ itt eq ++ andseq TMP2, TMP2, PATTERN ++ andseq TMP3, TMP3, PATTERN ++.endm ++ ++/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */ ++function ff_h264_find_start_code_candidate_armv6, export=1 ++ push {v1-v6,lr} ++ mov PTR, BUF ++ @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go ++ @ before using code that does preloads ++ cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 ++ blo 60f ++ ++ @ Get to word-alignment, 1 byte at a time ++ tst PTR, #3 ++ beq 2f ++1: ldrb DAT0, [PTR], #1 ++ sub SIZE, SIZE, #1 ++ teq DAT0, #0 ++ beq 90f ++ tst PTR, #3 ++ bne 1b ++2: @ Get to 4-word alignment, 1 word at a time ++ ldr PATTERN, =0x80008000 ++ setend be ++ tst PTR, #12 ++ beq 4f ++3: innerloop4 ++ bne 91f ++ tst PTR, #12 ++ bne 3b ++4: @ Get to cacheline (8-word) alignment ++ tst PTR, #16 ++ beq 5f ++ innerloop16 16 ++ bne 93f ++5: @ Check complete cachelines, with preloading ++ @ We need to stop when there are still (PRELOAD_DISTANCE+1) ++ @ complete cachelines to go ++ sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 ++6: innerloop16 , do_preload ++ bne 93f ++ innerloop16 32 ++ bne 93f ++ bcs 6b ++ @ Preload trailing part-cacheline, if any ++ tst SIZE, #31 ++ beq 7f ++ pld [PTR, #(PRELOAD_DISTANCE+1)*32] ++ @ Check remaining data without doing any more preloads. 
First ++ @ do in chunks of 4 words: ++7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 ++ bmi 9f ++8: innerloop16 16 ++ bne 93f ++ bcs 8b ++ @ Then in words: ++9: adds SIZE, SIZE, #16 - 4 ++ bmi 11f ++10: innerloop4 ++ bne 91f ++ bcs 10b ++11: setend le ++ @ Check second byte of final halfword ++ ldrb DAT0, [PTR, #-1] ++ teq DAT0, #0 ++ beq 90f ++ @ Check any remaining bytes ++ tst SIZE, #3 ++ beq 13f ++12: ldrb DAT0, [PTR], #1 ++ sub SIZE, SIZE, #1 ++ teq DAT0, #0 ++ beq 90f ++ tst SIZE, #3 ++ bne 12b ++ @ No candidate found ++13: sub RESULT, PTR, BUF ++ b 99f ++ ++60: @ Small buffer - simply check by looping over bytes ++ subs SIZE, SIZE, #1 ++ bcc 99f ++61: ldrb DAT0, [PTR], #1 ++ subs SIZE, SIZE, #1 ++ teq DAT0, #0 ++ beq 90f ++ bcs 61b ++ @ No candidate found ++ sub RESULT, PTR, BUF ++ b 99f ++ ++90: @ Found a candidate at the preceding byte ++ sub RESULT, PTR, BUF ++ sub RESULT, RESULT, #1 ++ b 99f ++ ++91: @ Found a candidate somewhere in the preceding 4 bytes ++ sub RESULT, PTR, BUF ++ sub RESULT, RESULT, #4 ++ sub TMP0, DAT0, #0x20000 ++ bics TMP0, TMP0, DAT0 ++ itt pl ++ ldrbpl DAT0, [PTR, #-3] ++ addpl RESULT, RESULT, #2 ++ bpl 92f ++ teq RESULT, #0 ++ beq 98f @ don't look back a byte if found at first byte in buffer ++ ldrb DAT0, [PTR, #-5] ++92: teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++ ++93: @ Found a candidate somewhere in the preceding 16 bytes ++ sub RESULT, PTR, BUF ++ sub RESULT, RESULT, #16 ++ teq TMP0, #0 ++ beq 95f @ not in first 4 bytes ++ sub TMP0, DAT0, #0x20000 ++ bics TMP0, TMP0, DAT0 ++ itt pl ++ ldrbpl DAT0, [PTR, #-15] ++ addpl RESULT, RESULT, #2 ++ bpl 94f ++ teq RESULT, #0 ++ beq 98f @ don't look back a byte if found at first byte in buffer ++ ldrb DAT0, [PTR, #-17] ++94: teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++95: add RESULT, RESULT, #4 ++ teq TMP1, #0 ++ beq 96f @ not in next 4 bytes ++ sub TMP1, DAT1, #0x20000 ++ bics TMP1, TMP1, DAT1 ++ itee mi ++ ldrbmi DAT0, [PTR, #-13] ++ ldrbpl DAT0, 
[PTR, #-11] ++ addpl RESULT, RESULT, #2 ++ teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++96: add RESULT, RESULT, #4 ++ teq TMP2, #0 ++ beq 97f @ not in next 4 bytes ++ sub TMP2, DAT2, #0x20000 ++ bics TMP2, TMP2, DAT2 ++ itee mi ++ ldrbmi DAT0, [PTR, #-9] ++ ldrbpl DAT0, [PTR, #-7] ++ addpl RESULT, RESULT, #2 ++ teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++97: add RESULT, RESULT, #4 ++ sub TMP3, DAT3, #0x20000 ++ bics TMP3, TMP3, DAT3 ++ itee mi ++ ldrbmi DAT0, [PTR, #-5] ++ ldrbpl DAT0, [PTR, #-3] ++ addpl RESULT, RESULT, #2 ++ teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ @ drop through to 98f ++98: setend le ++99: pop {v1-v6,pc} ++.endfunc ++ ++ .unreq RESULT ++ .unreq BUF ++ .unreq SIZE ++ .unreq PATTERN ++ .unreq PTR ++ .unreq DAT0 ++ .unreq DAT1 ++ .unreq DAT2 ++ .unreq DAT3 ++ .unreq TMP0 ++ .unreq TMP1 ++ .unreq TMP2 ++ .unreq TMP3 +diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c +index bb8b3b9..b206a1b 100644 +--- a/libavcodec/arm/h264dsp_init_arm.c ++++ b/libavcodec/arm/h264dsp_init_arm.c +@@ -24,6 +24,8 @@ + #include "libavutil/arm/cpu.h" + #include "libavcodec/h264dsp.h" + ++int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); ++ + void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, +@@ -102,6 +104,8 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, + { + int cpu_flags = av_get_cpu_flags(); + ++ if (have_armv6(cpu_flags)) ++ c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; + if (have_neon(cpu_flags)) + h264dsp_init_neon(c, bit_depth, chroma_format_idc); + } +-- +1.7.9.5 -- 1.9.3 From db098a580259625bb7b7385a48cb0756aea5cafe Mon Sep 17 00:00:00 2001 From: Ben Avison <bavison@riscosopen.org> Date: Wed, 16 Apr 2014 01:51:31 +0100 Subject: [PATCH 05/94] h264: Move start code search
functions into separate source files. This permits re-use with parsers for codecs which use similar start codes. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> --- lib/ffmpeg/libavcodec/Makefile | 2 +- lib/ffmpeg/libavcodec/arm/Makefile | 2 +- lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S | 253 --------------------------- lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c | 4 +- lib/ffmpeg/libavcodec/arm/startcode_armv6.S | 253 +++++++++++++++++++++++++++ lib/ffmpeg/libavcodec/h264dsp.c | 31 +--- lib/ffmpeg/libavcodec/startcode.c | 57 ++++++ lib/ffmpeg/libavcodec/startcode.h | 35 ++++ 8 files changed, 351 insertions(+), 286 deletions(-) delete mode 100644 lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S create mode 100644 lib/ffmpeg/libavcodec/arm/startcode_armv6.S create mode 100644 lib/ffmpeg/libavcodec/startcode.c create mode 100644 lib/ffmpeg/libavcodec/startcode.h diff --git a/lib/ffmpeg/libavcodec/Makefile b/lib/ffmpeg/libavcodec/Makefile index dc065a5..460f42c 100644 --- a/lib/ffmpeg/libavcodec/Makefile +++ b/lib/ffmpeg/libavcodec/Makefile @@ -49,7 +49,7 @@ OBJS-$(CONFIG_FFT) += avfft.o fft_fixed.o fft_float.o \ $(FFT-OBJS-yes) OBJS-$(CONFIG_GOLOMB) += golomb.o OBJS-$(CONFIG_H264CHROMA) += h264chroma.o -OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o +OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o startcode.o OBJS-$(CONFIG_H264PRED) += h264pred.o OBJS-$(CONFIG_H264QPEL) += h264qpel.o OBJS-$(CONFIG_HUFFMAN) += huffman.o diff --git a/lib/ffmpeg/libavcodec/arm/Makefile b/lib/ffmpeg/libavcodec/arm/Makefile index 480000b71..0b432e3 100644 --- a/lib/ffmpeg/libavcodec/arm/Makefile +++ b/lib/ffmpeg/libavcodec/arm/Makefile @@ -9,7 +9,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o \ OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o -ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o +ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \ arm/flacdsp_arm.o \ 
diff --git a/lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S b/lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S deleted file mode 100644 index c4f12a6..0000000 --- a/lib/ffmpeg/libavcodec/arm/h264dsp_armv6.S +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2013 RISC OS Open Ltd - * Author: Ben Avison <bavison@riscosopen.org> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -RESULT .req a1 -BUF .req a1 -SIZE .req a2 -PATTERN .req a3 -PTR .req a4 -DAT0 .req v1 -DAT1 .req v2 -DAT2 .req v3 -DAT3 .req v4 -TMP0 .req v5 -TMP1 .req v6 -TMP2 .req ip -TMP3 .req lr - -#define PRELOAD_DISTANCE 4 - -.macro innerloop4 - ldr DAT0, [PTR], #4 - subs SIZE, SIZE, #4 @ C flag survives rest of macro - sub TMP0, DAT0, PATTERN, lsr #14 - bic TMP0, TMP0, DAT0 - ands TMP0, TMP0, PATTERN -.endm - -.macro innerloop16 decrement, do_preload - ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} - .ifnc "\do_preload","" - pld [PTR, #PRELOAD_DISTANCE*32] - .endif - .ifnc "\decrement","" - subs SIZE, SIZE, #\decrement @ C flag survives rest of macro - .endif - sub TMP0, DAT0, PATTERN, lsr #14 - sub TMP1, DAT1, PATTERN, lsr #14 - bic TMP0, TMP0, DAT0 - bic TMP1, TMP1, DAT1 - sub TMP2, DAT2, PATTERN, lsr #14 - sub TMP3, DAT3, PATTERN, lsr #14 - ands TMP0, TMP0, PATTERN - bic 
TMP2, TMP2, DAT2 - it eq - andseq TMP1, TMP1, PATTERN - bic TMP3, TMP3, DAT3 - itt eq - andseq TMP2, TMP2, PATTERN - andseq TMP3, TMP3, PATTERN -.endm - -/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */ -function ff_h264_find_start_code_candidate_armv6, export=1 - push {v1-v6,lr} - mov PTR, BUF - @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go - @ before using code that does preloads - cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 - blo 60f - - @ Get to word-alignment, 1 byte at a time - tst PTR, #3 - beq 2f -1: ldrb DAT0, [PTR], #1 - sub SIZE, SIZE, #1 - teq DAT0, #0 - beq 90f - tst PTR, #3 - bne 1b -2: @ Get to 4-word alignment, 1 word at a time - ldr PATTERN, =0x80008000 - setend be - tst PTR, #12 - beq 4f -3: innerloop4 - bne 91f - tst PTR, #12 - bne 3b -4: @ Get to cacheline (8-word) alignment - tst PTR, #16 - beq 5f - innerloop16 16 - bne 93f -5: @ Check complete cachelines, with preloading - @ We need to stop when there are still (PRELOAD_DISTANCE+1) - @ complete cachelines to go - sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 -6: innerloop16 , do_preload - bne 93f - innerloop16 32 - bne 93f - bcs 6b - @ Preload trailing part-cacheline, if any - tst SIZE, #31 - beq 7f - pld [PTR, #(PRELOAD_DISTANCE+1)*32] - @ Check remaining data without doing any more preloads. 
First - @ do in chunks of 4 words: -7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 - bmi 9f -8: innerloop16 16 - bne 93f - bcs 8b - @ Then in words: -9: adds SIZE, SIZE, #16 - 4 - bmi 11f -10: innerloop4 - bne 91f - bcs 10b -11: setend le - @ Check second byte of final halfword - ldrb DAT0, [PTR, #-1] - teq DAT0, #0 - beq 90f - @ Check any remaining bytes - tst SIZE, #3 - beq 13f -12: ldrb DAT0, [PTR], #1 - sub SIZE, SIZE, #1 - teq DAT0, #0 - beq 90f - tst SIZE, #3 - bne 12b - @ No candidate found -13: sub RESULT, PTR, BUF - b 99f - -60: @ Small buffer - simply check by looping over bytes - subs SIZE, SIZE, #1 - bcc 99f -61: ldrb DAT0, [PTR], #1 - subs SIZE, SIZE, #1 - teq DAT0, #0 - beq 90f - bcs 61b - @ No candidate found - sub RESULT, PTR, BUF - b 99f - -90: @ Found a candidate at the preceding byte - sub RESULT, PTR, BUF - sub RESULT, RESULT, #1 - b 99f - -91: @ Found a candidate somewhere in the preceding 4 bytes - sub RESULT, PTR, BUF - sub RESULT, RESULT, #4 - sub TMP0, DAT0, #0x20000 - bics TMP0, TMP0, DAT0 - itt pl - ldrbpl DAT0, [PTR, #-3] - addpl RESULT, RESULT, #2 - bpl 92f - teq RESULT, #0 - beq 98f @ don't look back a byte if found at first byte in buffer - ldrb DAT0, [PTR, #-5] -92: teq DAT0, #0 - it eq - subeq RESULT, RESULT, #1 - b 98f - -93: @ Found a candidate somewhere in the preceding 16 bytes - sub RESULT, PTR, BUF - sub RESULT, RESULT, #16 - teq TMP0, #0 - beq 95f @ not in first 4 bytes - sub TMP0, DAT0, #0x20000 - bics TMP0, TMP0, DAT0 - itt pl - ldrbpl DAT0, [PTR, #-15] - addpl RESULT, RESULT, #2 - bpl 94f - teq RESULT, #0 - beq 98f @ don't look back a byte if found at first byte in buffer - ldrb DAT0, [PTR, #-17] -94: teq DAT0, #0 - it eq - subeq RESULT, RESULT, #1 - b 98f -95: add RESULT, RESULT, #4 - teq TMP1, #0 - beq 96f @ not in next 4 bytes - sub TMP1, DAT1, #0x20000 - bics TMP1, TMP1, DAT1 - itee mi - ldrbmi DAT0, [PTR, #-13] - ldrbpl DAT0, [PTR, #-11] - addpl RESULT, RESULT, #2 - teq DAT0, #0 - it eq - subeq RESULT, RESULT, #1 - 
b 98f -96: add RESULT, RESULT, #4 - teq TMP2, #0 - beq 97f @ not in next 4 bytes - sub TMP2, DAT2, #0x20000 - bics TMP2, TMP2, DAT2 - itee mi - ldrbmi DAT0, [PTR, #-9] - ldrbpl DAT0, [PTR, #-7] - addpl RESULT, RESULT, #2 - teq DAT0, #0 - it eq - subeq RESULT, RESULT, #1 - b 98f -97: add RESULT, RESULT, #4 - sub TMP3, DAT3, #0x20000 - bics TMP3, TMP3, DAT3 - itee mi - ldrbmi DAT0, [PTR, #-5] - ldrbpl DAT0, [PTR, #-3] - addpl RESULT, RESULT, #2 - teq DAT0, #0 - it eq - subeq RESULT, RESULT, #1 - @ drop through to 98f -98: setend le -99: pop {v1-v6,pc} -.endfunc - - .unreq RESULT - .unreq BUF - .unreq SIZE - .unreq PATTERN - .unreq PTR - .unreq DAT0 - .unreq DAT1 - .unreq DAT2 - .unreq DAT3 - .unreq TMP0 - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 diff --git a/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c b/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c index 2804e56..842fb9f 100644 --- a/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c +++ b/lib/ffmpeg/libavcodec/arm/h264dsp_init_arm.c @@ -24,7 +24,7 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/h264dsp.h" -int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); +int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size); void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); @@ -109,7 +109,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, int cpu_flags = av_get_cpu_flags(); if (have_armv6(cpu_flags)) - c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; + c->h264_find_start_code_candidate = ff_startcode_find_candidate_armv6; if (have_neon(cpu_flags)) ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc); } diff --git a/lib/ffmpeg/libavcodec/arm/startcode_armv6.S b/lib/ffmpeg/libavcodec/arm/startcode_armv6.S new file mode 100644 index 0000000..a46f009 --- /dev/null +++ b/lib/ffmpeg/libavcodec/arm/startcode_armv6.S @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2013 RISC OS Open Ltd + * Author: Ben 
Avison <bavison@riscosopen.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +RESULT .req a1 +BUF .req a1 +SIZE .req a2 +PATTERN .req a3 +PTR .req a4 +DAT0 .req v1 +DAT1 .req v2 +DAT2 .req v3 +DAT3 .req v4 +TMP0 .req v5 +TMP1 .req v6 +TMP2 .req ip +TMP3 .req lr + +#define PRELOAD_DISTANCE 4 + +.macro innerloop4 + ldr DAT0, [PTR], #4 + subs SIZE, SIZE, #4 @ C flag survives rest of macro + sub TMP0, DAT0, PATTERN, lsr #14 + bic TMP0, TMP0, DAT0 + ands TMP0, TMP0, PATTERN +.endm + +.macro innerloop16 decrement, do_preload + ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} + .ifnc "\do_preload","" + pld [PTR, #PRELOAD_DISTANCE*32] + .endif + .ifnc "\decrement","" + subs SIZE, SIZE, #\decrement @ C flag survives rest of macro + .endif + sub TMP0, DAT0, PATTERN, lsr #14 + sub TMP1, DAT1, PATTERN, lsr #14 + bic TMP0, TMP0, DAT0 + bic TMP1, TMP1, DAT1 + sub TMP2, DAT2, PATTERN, lsr #14 + sub TMP3, DAT3, PATTERN, lsr #14 + ands TMP0, TMP0, PATTERN + bic TMP2, TMP2, DAT2 + it eq + andseq TMP1, TMP1, PATTERN + bic TMP3, TMP3, DAT3 + itt eq + andseq TMP2, TMP2, PATTERN + andseq TMP3, TMP3, PATTERN +.endm + +/* int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size) */ +function ff_startcode_find_candidate_armv6, export=1 + 
push {v1-v6,lr} + mov PTR, BUF + @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go + @ before using code that does preloads + cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 + blo 60f + + @ Get to word-alignment, 1 byte at a time + tst PTR, #3 + beq 2f +1: ldrb DAT0, [PTR], #1 + sub SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + tst PTR, #3 + bne 1b +2: @ Get to 4-word alignment, 1 word at a time + ldr PATTERN, =0x80008000 + setend be + tst PTR, #12 + beq 4f +3: innerloop4 + bne 91f + tst PTR, #12 + bne 3b +4: @ Get to cacheline (8-word) alignment + tst PTR, #16 + beq 5f + innerloop16 16 + bne 93f +5: @ Check complete cachelines, with preloading + @ We need to stop when there are still (PRELOAD_DISTANCE+1) + @ complete cachelines to go + sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 +6: innerloop16 , do_preload + bne 93f + innerloop16 32 + bne 93f + bcs 6b + @ Preload trailing part-cacheline, if any + tst SIZE, #31 + beq 7f + pld [PTR, #(PRELOAD_DISTANCE+1)*32] + @ Check remaining data without doing any more preloads. 
First + @ do in chunks of 4 words: +7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 + bmi 9f +8: innerloop16 16 + bne 93f + bcs 8b + @ Then in words: +9: adds SIZE, SIZE, #16 - 4 + bmi 11f +10: innerloop4 + bne 91f + bcs 10b +11: setend le + @ Check second byte of final halfword + ldrb DAT0, [PTR, #-1] + teq DAT0, #0 + beq 90f + @ Check any remaining bytes + tst SIZE, #3 + beq 13f +12: ldrb DAT0, [PTR], #1 + sub SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + tst SIZE, #3 + bne 12b + @ No candidate found +13: sub RESULT, PTR, BUF + b 99f + +60: @ Small buffer - simply check by looping over bytes + subs SIZE, SIZE, #1 + bcc 99f +61: ldrb DAT0, [PTR], #1 + subs SIZE, SIZE, #1 + teq DAT0, #0 + beq 90f + bcs 61b + @ No candidate found + sub RESULT, PTR, BUF + b 99f + +90: @ Found a candidate at the preceding byte + sub RESULT, PTR, BUF + sub RESULT, RESULT, #1 + b 99f + +91: @ Found a candidate somewhere in the preceding 4 bytes + sub RESULT, PTR, BUF + sub RESULT, RESULT, #4 + sub TMP0, DAT0, #0x20000 + bics TMP0, TMP0, DAT0 + itt pl + ldrbpl DAT0, [PTR, #-3] + addpl RESULT, RESULT, #2 + bpl 92f + teq RESULT, #0 + beq 98f @ don't look back a byte if found at first byte in buffer + ldrb DAT0, [PTR, #-5] +92: teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f + +93: @ Found a candidate somewhere in the preceding 16 bytes + sub RESULT, PTR, BUF + sub RESULT, RESULT, #16 + teq TMP0, #0 + beq 95f @ not in first 4 bytes + sub TMP0, DAT0, #0x20000 + bics TMP0, TMP0, DAT0 + itt pl + ldrbpl DAT0, [PTR, #-15] + addpl RESULT, RESULT, #2 + bpl 94f + teq RESULT, #0 + beq 98f @ don't look back a byte if found at first byte in buffer + ldrb DAT0, [PTR, #-17] +94: teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +95: add RESULT, RESULT, #4 + teq TMP1, #0 + beq 96f @ not in next 4 bytes + sub TMP1, DAT1, #0x20000 + bics TMP1, TMP1, DAT1 + itee mi + ldrbmi DAT0, [PTR, #-13] + ldrbpl DAT0, [PTR, #-11] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + 
b 98f +96: add RESULT, RESULT, #4 + teq TMP2, #0 + beq 97f @ not in next 4 bytes + sub TMP2, DAT2, #0x20000 + bics TMP2, TMP2, DAT2 + itee mi + ldrbmi DAT0, [PTR, #-9] + ldrbpl DAT0, [PTR, #-7] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + b 98f +97: add RESULT, RESULT, #4 + sub TMP3, DAT3, #0x20000 + bics TMP3, TMP3, DAT3 + itee mi + ldrbmi DAT0, [PTR, #-5] + ldrbpl DAT0, [PTR, #-3] + addpl RESULT, RESULT, #2 + teq DAT0, #0 + it eq + subeq RESULT, RESULT, #1 + @ drop through to 98f +98: setend le +99: pop {v1-v6,pc} +endfunc + + .unreq RESULT + .unreq BUF + .unreq SIZE + .unreq PATTERN + .unreq PTR + .unreq DAT0 + .unreq DAT1 + .unreq DAT2 + .unreq DAT3 + .unreq TMP0 + .unreq TMP1 + .unreq TMP2 + .unreq TMP3 diff --git a/lib/ffmpeg/libavcodec/h264dsp.c b/lib/ffmpeg/libavcodec/h264dsp.c index b7d61cd..a84ae59 100644 --- a/lib/ffmpeg/libavcodec/h264dsp.c +++ b/lib/ffmpeg/libavcodec/h264dsp.c @@ -30,6 +30,7 @@ #include "avcodec.h" #include "h264dsp.h" #include "h264idct.h" +#include "startcode.h" #include "libavutil/common.h" #define BIT_DEPTH 8 @@ -60,34 +61,6 @@ #include "h264addpx_template.c" #undef BIT_DEPTH -static int h264_find_start_code_candidate_c(const uint8_t *buf, int size) -{ - int i = 0; -#if HAVE_FAST_UNALIGNED - /* we check i < size instead of i + 3 / 7 because it is - * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE - * bytes at the end. 
- */ -#if HAVE_FAST_64BIT - while (i < size && - !((~*(const uint64_t *)(buf + i) & - (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & - 0x8080808080808080ULL)) - i += 8; -#else - while (i < size && - !((~*(const uint32_t *)(buf + i) & - (*(const uint32_t *)(buf + i) - 0x01010101U)) & - 0x80808080U)) - i += 4; -#endif -#endif - for (; i < size; i++) - if (!buf[i]) - break; - return i; -} - void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { #undef FUNC @@ -174,7 +147,7 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo H264_DSP(8); break; } - c->h264_find_start_code_candidate = h264_find_start_code_candidate_c; + c->h264_find_start_code_candidate = ff_startcode_find_candidate_c; if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); diff --git a/lib/ffmpeg/libavcodec/startcode.c b/lib/ffmpeg/libavcodec/startcode.c new file mode 100644 index 0000000..5df7695 --- /dev/null +++ b/lib/ffmpeg/libavcodec/startcode.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Accelerated start code search function for start codes common to + * MPEG-1/2/4 video, VC-1, H.264/5 + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#include "startcode.h" +#include "config.h" + +int ff_startcode_find_candidate_c(const uint8_t *buf, int size) +{ + int i = 0; +#if HAVE_FAST_UNALIGNED + /* we check i < size instead of i + 3 / 7 because it is + * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE + * bytes at the end. + */ +# if HAVE_FAST_64BIT + while (i < size && + !((~*(const uint64_t *)(buf + i) & + (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & + 0x8080808080808080ULL)) + i += 8; +# else + while (i < size && + !((~*(const uint32_t *)(buf + i) & + (*(const uint32_t *)(buf + i) - 0x01010101U)) & + 0x80808080U)) + i += 4; +# endif +#endif + for (; i < size; i++) + if (!buf[i]) + break; + return i; +} diff --git a/lib/ffmpeg/libavcodec/startcode.h b/lib/ffmpeg/libavcodec/startcode.h new file mode 100644 index 0000000..cc55d5f --- /dev/null +++ b/lib/ffmpeg/libavcodec/startcode.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Accelerated start code search function for start codes common to + * MPEG-1/2/4 video, VC-1, H.264/5 + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVCODEC_STARTCODE_H +#define AVCODEC_STARTCODE_H + +#include <stdint.h> + +int ff_startcode_find_candidate_c(const uint8_t *buf, int size); + +#endif /* AVCODEC_STARTCODE_H */ -- 1.9.3 From 7d95eb8e026582e5446e7e11d75ba999286a34d0 Mon Sep 17 00:00:00 2001 From: Ben Avison <bavison@riscosopen.org> Date: Wed, 16 Apr 2014 01:51:32 +0100 Subject: [PATCH 06/94] vc-1: Add platform-specific start code search routine to VC1DSPContext. Initialise VC1DSPContext for parser as well as for decoder. Note, the VC-1 code doesn't actually use the function pointer yet. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> --- lib/ffmpeg/libavcodec/Makefile | 7 +++--- lib/ffmpeg/libavcodec/arm/Makefile | 3 +++ lib/ffmpeg/libavcodec/arm/vc1dsp_init_arm.c | 33 +++++++++++++++++++++++++++++ lib/ffmpeg/libavcodec/vc1.c | 2 ++ lib/ffmpeg/libavcodec/vc1dec.c | 1 - lib/ffmpeg/libavcodec/vc1dsp.c | 5 +++++ lib/ffmpeg/libavcodec/vc1dsp.h | 9 ++++++++ 7 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 lib/ffmpeg/libavcodec/arm/vc1dsp_init_arm.c diff --git a/lib/ffmpeg/libavcodec/Makefile b/lib/ffmpeg/libavcodec/Makefile index 460f42c..8d8a548 100644 --- a/lib/ffmpeg/libavcodec/Makefile +++ b/lib/ffmpeg/libavcodec/Makefile @@ -455,7 +455,7 @@ OBJS-$(CONFIG_VB_DECODER) += vb.o OBJS-$(CONFIG_VBLE_DECODER) += vble.o OBJS-$(CONFIG_VC1_DECODER) += vc1dec.o vc1.o vc1data.o vc1dsp.o \ msmpeg4.o msmpeg4data.o \ - intrax8.o intrax8dsp.o + intrax8.o intrax8dsp.o startcode.o OBJS-$(CONFIG_VC1_DXVA2_HWACCEL) += dxva2_vc1.o OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o 
OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o @@ -487,6 +487,7 @@ OBJS-$(CONFIG_WMAVOICE_DECODER) += wmavoice.o \ celp_filters.o \ acelp_vectors.o acelp_filters.o OBJS-$(CONFIG_WMV1_DECODER) += msmpeg4.o msmpeg4data.o + OBJS-$(CONFIG_WMV2_DECODER) += wmv2dec.o wmv2.o wmv2dsp.o \ msmpeg4.o msmpeg4data.o \ intrax8.o intrax8dsp.o @@ -746,9 +747,9 @@ OBJS-$(CONFIG_PNM_PARSER) += pnm_parser.o pnm.o OBJS-$(CONFIG_RV30_PARSER) += rv34_parser.o OBJS-$(CONFIG_RV40_PARSER) += rv34_parser.o OBJS-$(CONFIG_TAK_PARSER) += tak_parser.o tak.o -OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o \ +OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o vc1dsp.o \ msmpeg4.o msmpeg4data.o mpeg4video.o \ - h263.o + h263.o startcode.o OBJS-$(CONFIG_VORBIS_PARSER) += vorbis_parser.o xiph.o OBJS-$(CONFIG_VP3_PARSER) += vp3_parser.o OBJS-$(CONFIG_VP8_PARSER) += vp8_parser.o diff --git a/lib/ffmpeg/libavcodec/arm/Makefile b/lib/ffmpeg/libavcodec/arm/Makefile index 0b432e3..715eed7 100644 --- a/lib/ffmpeg/libavcodec/arm/Makefile +++ b/lib/ffmpeg/libavcodec/arm/Makefile @@ -16,6 +16,9 @@ OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o