diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 84e2220fb7..71ed4e64d8 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -112,8 +112,7 @@ typedef struct VP9Context {
     uint8_t invisible;
     uint8_t use_last_frame_mvs;
     uint8_t errorres;
-    uint8_t colorspace;
-    uint8_t fullrange;
+    uint8_t ss_h, ss_v;
     uint8_t intraonly;
     uint8_t resetctx;
     uint8_t refreshrefmask;
@@ -216,7 +215,7 @@ typedef struct VP9Context {
     DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
     DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
     DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
-    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
+    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
     DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
     DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
     DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
@@ -249,8 +248,8 @@ typedef struct VP9Context {
     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
     struct { int x, y; } min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64];
     uint16_t mvscale[3][2];
     uint8_t mvstep[3][2];
 } VP9Context;
@@ -308,39 +307,42 @@ static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
     return 0;
 }
 
-static int update_size(AVCodecContext *ctx, int w, int h)
+static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
 {
     VP9Context *s = ctx->priv_data;
     uint8_t *p;
 
     av_assert0(w > 0 && h > 0);
 
-    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
+    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
         return 0;
 
-    ctx->width  = w;
-    ctx->height = h;
-    s->sb_cols  = (w + 63) >> 6;
-    s->sb_rows  = (h + 63) >> 6;
-    s->cols     = (w + 7) >> 3;
-    s->rows     = (h + 7) >> 3;
+    ctx->width   = w;
+    ctx->height  = h;
+    ctx->pix_fmt = fmt;
+    s->sb_cols   = (w + 63) >> 6;
+    s->sb_rows   = (h + 63) >> 6;
+    s->cols      = (w + 7) >> 3;
+    s->rows      = (h + 7) >> 3;
 
 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
     av_freep(&s->intra_pred_data[0]);
-    p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+    // FIXME we slightly over-allocate here for subsampled chroma, but a little
+    // bit of padding shouldn't affect performance...
+    p = av_malloc(s->sb_cols * (320 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
     assign(s->intra_pred_data[0],  uint8_t *,             64);
-    assign(s->intra_pred_data[1],  uint8_t *,             32);
-    assign(s->intra_pred_data[2],  uint8_t *,             32);
+    assign(s->intra_pred_data[1],  uint8_t *,             64);
+    assign(s->intra_pred_data[2],  uint8_t *,             64);
     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
     assign(s->above_mode_ctx,      uint8_t *,             16);
     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
     assign(s->above_partition_ctx, uint8_t *,              8);
     assign(s->above_skip_ctx,      uint8_t *,              8);
     assign(s->above_txfm_ctx,      uint8_t *,              8);
-    assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
-    assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
     assign(s->above_segpred_ctx,   uint8_t *,              8);
     assign(s->above_intra_ctx,     uint8_t *,              8);
     assign(s->above_comp_ctx,      uint8_t *,              8);
@@ -359,34 +361,39 @@ static int update_size(AVCodecContext *ctx, int w, int h)
 static int update_block_buffers(AVCodecContext *ctx)
 {
     VP9Context *s = ctx->priv_data;
+    int chroma_blocks, chroma_eobs;
 
     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
         return 0;
 
     av_free(s->b_base);
     av_free(s->block_base);
+    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
+    chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
     if (s->frames[CUR_FRAME].uses_2pass) {
         int sbs = s->sb_cols * s->sb_rows;
 
         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
+        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+                                    16 * 16 + 2 * chroma_eobs) * sbs);
         if (!s->b_base || !s->block_base)
             return AVERROR(ENOMEM);
         s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
-        s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
-        s->uveob_base[0] = s->eob_base + 256 * sbs;
-        s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
+        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks);
+        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
     } else {
         s->b_base = av_malloc(sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 128) * 3);
+        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+                                   16 * 16 + 2 * chroma_eobs);
         if (!s->b_base || !s->block_base)
             return AVERROR(ENOMEM);
         s->uvblock_base[0] = s->block_base + 64 * 64;
-        s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
-        s->uveob_base[0] = s->eob_base + 256;
-        s->uveob_base[1] = s->uveob_base[0] + 64;
+        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks);
+        s->uveob_base[0] = s->eob_base + 16 * 16;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
     }
     s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
 
@@ -463,11 +470,56 @@ static int update_prob(VP56RangeCoder *c, int p)
                     255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
 }
 
+static enum AVPixelFormat read_colorspace_details(AVCodecContext *ctx)
+{
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+    VP9Context *s = ctx->priv_data;
+    enum AVPixelFormat res;
+
+    ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
+    if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
+        if (s->profile == 1) {
+            s->ss_h = s->ss_v = 1;
+            res = AV_PIX_FMT_GBRP;
+            ctx->color_range = AVCOL_RANGE_JPEG;
+        } else {
+            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        static const enum AVPixelFormat pix_fmt_for_ss[2 /* v */][2 /* h */] = {
+            { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
+            { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P },
+        };
+        ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
+        if (s->profile == 1) {
+            s->ss_h = get_bits1(&s->gb);
+            s->ss_v = get_bits1(&s->gb);
+            if ((res = pix_fmt_for_ss[s->ss_v][s->ss_h]) == AV_PIX_FMT_YUV420P) {
+                av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile 1\n");
+                return AVERROR_INVALIDDATA;
+            } else if (get_bits1(&s->gb)) {
+                av_log(ctx, AV_LOG_ERROR, "Profile 1 color details reserved bit set\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            s->ss_h = s->ss_v = 1;
+            res = AV_PIX_FMT_YUV420P;
+        }
+    }
+
+    return res;
+}
+
 static int decode_frame_header(AVCodecContext *ctx,
                                const uint8_t *data, int size, int *ref)
 {
     VP9Context *s = ctx->priv_data;
     int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
+    enum AVPixelFormat fmt = ctx->pix_fmt;
     int last_invisible;
     const uint8_t *data2;
 
@@ -481,8 +533,9 @@ static int decode_frame_header(AVCodecContext *ctx,
         return AVERROR_INVALIDDATA;
     }
     s->profile = get_bits1(&s->gb);
-    if (get_bits1(&s->gb)) { // reserved bit
-        av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
+    s->profile |= get_bits1(&s->gb) << 1;
+    if (s->profile > 1) {
+        av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", s->profile);
         return AVERROR_INVALIDDATA;
     }
     if (get_bits1(&s->gb)) {
@@ -500,12 +553,8 @@ static int decode_frame_header(AVCodecContext *ctx,
             av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
             return AVERROR_INVALIDDATA;
         }
-        s->colorspace = get_bits(&s->gb, 3);
-        if (s->colorspace == 7) { // RGB = profile 1
-            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->fullrange  = get_bits1(&s->gb);
+        if ((fmt = read_colorspace_details(ctx)) < 0)
+            return fmt;
         // for profile 1, here follows the subsampling bits
         s->refreshrefmask = 0xff;
         w = get_bits(&s->gb, 16) + 1;
@@ -520,6 +569,15 @@ static int decode_frame_header(AVCodecContext *ctx,
                 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (s->profile == 1) {
+                if ((fmt = read_colorspace_details(ctx)) < 0)
+                    return fmt;
+            } else {
+                s->ss_h = s->ss_v = 1;
+                fmt = AV_PIX_FMT_YUV420P;
+                ctx->colorspace = AVCOL_SPC_BT470BG;
+                ctx->color_range = AVCOL_RANGE_JPEG;
+            }
             s->refreshrefmask = get_bits(&s->gb, 8);
             w = get_bits(&s->gb, 16) + 1;
             h = get_bits(&s->gb, 16) + 1;
@@ -722,8 +780,8 @@ static int decode_frame_header(AVCodecContext *ctx,
     }
 
     /* tiling info */
-    if ((res = update_size(ctx, w, h)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
+    if ((res = update_size(ctx, w, h, fmt)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
         return res;
     }
     for (s->tiling.log2_tile_cols = 0;
@@ -2279,12 +2337,12 @@ static void decode_coeffs(AVCodecContext *ctx)
         break;
     }
 
-#define DECODE_UV_COEF_LOOP(step) \
+#define DECODE_UV_COEF_LOOP(step, decode_coeffs_fn) \
     for (n = 0, y = 0; y < end_y; y += step) { \
         for (x = 0; x < end_x; x += step, n += step * step) { \
-            res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
-                                  16 * step * step, c, e, p, a[x] + l[y], \
-                                  uvscan, uvnb, uv_band_counts, qmul[1]); \
+            res = decode_coeffs_fn(&s->c, s->uvblock[pl] + 16 * n, \
+                                   16 * step * step, c, e, p, a[x] + l[y], \
+                                   uvscan, uvnb, uv_band_counts, qmul[1]); \
             a[x] = l[y] = !!res; \
             if (step >= 4) { \
                 AV_WN16A(&s->uveob[pl][n], res); \
@@ -2297,36 +2355,30 @@ static void decode_coeffs(AVCodecContext *ctx)
     p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
     c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
     e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
-    w4 >>= 1;
-    h4 >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    h4 >>= s->ss_v;
+    end_y >>= s->ss_v;
     for (pl = 0; pl < 2; pl++) {
-        a = &s->above_uv_nnz_ctx[pl][col];
-        l = &s->left_uv_nnz_ctx[pl][row & 7];
+        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
+        l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
         switch (b->uvtx) {
         case TX_4X4:
-            DECODE_UV_COEF_LOOP(1);
+            DECODE_UV_COEF_LOOP(1, decode_coeffs_b);
             break;
         case TX_8X8:
             MERGE_CTX(2, AV_RN16A);
-            DECODE_UV_COEF_LOOP(2);
+            DECODE_UV_COEF_LOOP(2, decode_coeffs_b);
             SPLAT_CTX(2);
             break;
         case TX_16X16:
             MERGE_CTX(4, AV_RN32A);
-            DECODE_UV_COEF_LOOP(4);
+            DECODE_UV_COEF_LOOP(4, decode_coeffs_b);
             SPLAT_CTX(4);
             break;
         case TX_32X32:
             MERGE_CTX(8, AV_RN64A);
-            // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
-            // so there is no need to loop
-            res = decode_coeffs_b32(&s->c, s->uvblock[pl],
-                                    1024, c, e, p, a[0] + l[0],
-                                    uvscan, uvnb, uv_band_counts, qmul[1]);
-            a[0] = l[0] = !!res;
-            AV_WN16A(&s->uveob[pl][0], res);
+            DECODE_UV_COEF_LOOP(8, decode_coeffs_b32);
             SPLAT_CTX(8);
             break;
         }
@@ -2338,7 +2390,7 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
                                              uint8_t *dst_inner, ptrdiff_t stride_inner,
                                              uint8_t *l, int col, int x, int w,
                                              int row, int y, enum TxfmMode tx,
-                                             int p)
+                                             int p, int ss_h, int ss_v)
 {
     int have_top = row > 0 || y > 0;
     int have_left = col > s->tiling.tile_col_start || x > 0;
@@ -2393,7 +2445,7 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
     mode = mode_conv[mode][have_left][have_top];
     if (edges[mode].needs_top) {
         uint8_t *top, *topleft;
-        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
+        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
         int n_px_need_tr = 0;
 
         if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
@@ -2404,11 +2456,11 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
         // post-loopfilter data)
         if (have_top) {
             top = !(row & 7) && !y ?
-                s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
+                s->intra_pred_data[p] + col * (8 >> ss_h) + x * 4 :
                 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
             if (have_left)
                 topleft = !(row & 7) && !y ?
-                    s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
+                    s->intra_pred_data[p] + col * (8 >> ss_h) + x * 4 :
                     y == 0 || x == 0 ? &dst_edge[-stride_edge] :
                     &dst_inner[-stride_inner];
         }
@@ -2449,7 +2501,7 @@ static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **
     }
     if (edges[mode].needs_left) {
         if (have_left) {
-            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
+            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
             uint8_t *dst = x == 0 ? dst_edge : dst_inner;
             ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
 
@@ -2508,7 +2560,7 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
             mode = check_intra_mode(s, mode, &a, ptr_r,
                                     s->frames[CUR_FRAME].tf.f->linesize[0],
                                     ptr, s->y_stride, l,
-                                    col, x, w4, row, y, b->tx, 0);
+                                    col, x, w4, row, y, b->tx, 0, 0, 0);
             s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
             if (eob)
                 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
@@ -2519,9 +2571,9 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
     }
 
     // U/V
-    w4 >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    end_y >>= s->ss_v;
     step = 1 << (b->uvtx * 2);
     for (p = 0; p < 2; p++) {
         dst   = s->dst[1 + p];
@@ -2536,8 +2588,8 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
 
                 mode = check_intra_mode(s, mode, &a, ptr_r,
                                         s->frames[CUR_FRAME].tf.f->linesize[1],
-                                        ptr, s->uv_stride, l,
-                                        col, x, w4, row, y, b->uvtx, p + 1);
+                                        ptr, s->uv_stride, l, col, x, w4, row, y,
+                                        b->uvtx, p + 1, s->ss_h, s->ss_v);
                 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
                 if (eob)
                     s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
@@ -2557,7 +2609,7 @@ static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func sm
                                             int bw, int bh, int w, int h,
                                             const uint16_t *scale, const uint8_t *step)
 {
-#define scale_mv(n, dim) (((int64_t)n * scale[dim]) >> 14)
+#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
     // BUG libvpx seems to scale the two components separately. This introduces
     // rounding errors but we have to reproduce them to be exactly compatible
     // with the output from libvpx...
@@ -2601,8 +2653,8 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
                                               const uint16_t *scale, const uint8_t *step)
 {
     // BUG https://code.google.com/p/webm/issues/detail?id=820
-    int mx = scale_mv(mv->x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
-    int my = scale_mv(mv->y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+    int mx = scale_mv(mv->x << !s->ss_h, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    int my = scale_mv(mv->y << !s->ss_v, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
 #undef scale_mv
     int refbw_m1, refbh_m1;
     int th;
@@ -2618,7 +2670,7 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
     // we use +7 because the last 7 pixels of each sbrow can be changed in
     // the longest loopfilter of the next sbrow
-    th = (y + refbh_m1 + 4 + 7) >> 5;
+    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
     if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) {
         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
@@ -2696,7 +2748,7 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
                                                 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
                                                 int bw, int bh, int w, int h)
 {
-    int mx = mv->x, my = mv->y, th;
+    int mx = mv->x << !s->ss_h, my = mv->y << !s->ss_v, th;
 
     y += my >> 4;
     x += mx >> 4;
@@ -2707,7 +2759,7 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
     // we use +7 because the last 7 pixels of each sbrow can be changed in
     // the longest loopfilter of the next sbrow
-    th = (y + bh + 4 * !!my + 7) >> 5;
+    th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
     if (x < !!mx * 3 || y < !!my * 3 ||
         x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
@@ -2781,8 +2833,8 @@ static void inter_recon(AVCodecContext *ctx)
         }
 
         // uv itxfm add
-        end_x >>= 1;
-        end_y >>= 1;
+        end_x >>= s->ss_h;
+        end_y >>= s->ss_v;
         step = 1 << (b->uvtx * 2);
         for (p = 0; p < 2; p++) {
             dst = s->dst[p + 1];
@@ -2801,11 +2853,14 @@ static void inter_recon(AVCodecContext *ctx)
     }
 }
 
-static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
+static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
                                         int row_and_7, int col_and_7,
                                         int w, int h, int col_end, int row_end,
                                         enum TxfmMode tx, int skip_inter)
 {
+    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
+    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
+
     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
@@ -2816,14 +2871,14 @@ static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
     // a time, and we only use the topleft block's mode information to set
     // things like block strength. Thus, for any block size smaller than
     // 16x16, ignore the odd portion of the block.
-    if (tx == TX_4X4 && is_uv) {
-        if (h == 1) {
+    if (tx == TX_4X4 && (ss_v | ss_h)) {
+        if (h == ss_v) {
             if (row_and_7 & 1)
                 return;
             if (!row_end)
                 h += 1;
         }
-        if (w == 1) {
+        if (w == ss_h) {
             if (col_and_7 & 1)
                 return;
             if (!col_end)
@@ -2833,103 +2888,85 @@ static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
 
     if (tx == TX_4X4 && !skip_inter) {
         int t = 1 << col_and_7, m_col = (t << w) - t, y;
-        int m_col_odd = (t << (w - 1)) - t;
-
         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
-        if (is_uv) {
-            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
+        int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
 
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 7);
+        for (y = row_and_7; y < h + row_and_7; y++) {
+            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
 
-                lflvl->mask[is_uv][0][y][1] |= m_row_8;
-                lflvl->mask[is_uv][0][y][2] |= m_row_4;
-                // for odd lines, if the odd col is not being filtered,
-                // skip odd row also:
-                // .---. <-- a
-                // |   |
-                // |___| <-- b
-                // ^   ^
-                // c   d
-                //
-                // if a/c are even row/col and b/d are odd, and d is skipped,
-                // e.g. right edge of size-66x66.webm, then skip b also (bug)
-                if ((col_end & 1) && (y & 1)) {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
-                } else {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
-                }
-            }
-        } else {
-            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 3);
-
-                lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
-                lflvl->mask[is_uv][0][y][2] |= m_row_4;
-                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
-                lflvl->mask[is_uv][0][y][3] |= m_col;
-                lflvl->mask[is_uv][1][y][3] |= m_col;
+            mask[0][y][1] |= m_row_8;
+            mask[0][y][2] |= m_row_4;
+            // for odd lines, if the odd col is not being filtered,
+            // skip odd row also:
+            // .---. <-- a
+            // |   |
+            // |___| <-- b
+            // ^   ^
+            // c   d
+            //
+            // if a/c are even row/col and b/d are odd, and d is skipped,
+            // e.g. right edge of size-66x66.webm, then skip b also (bug)
+            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
+                mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
+            } else {
+                mask[1][y][col_mask_id] |= m_col;
             }
+            if (!ss_h)
+                mask[0][y][3] |= m_col;
+            if (!ss_v)
+                mask[1][y][3] |= m_col;
         }
     } else {
         int y, t = 1 << col_and_7, m_col = (t << w) - t;
 
         if (!skip_inter) {
             int mask_id = (tx == TX_8X8);
-            int l2 = tx + is_uv - 1, step1d = 1 << l2;
             static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
+            int l2 = tx + ss_h - 1, step1d;
             int m_row = m_col & masks[l2];
 
             // at odd UV col/row edges tx16/tx32 loopfilter edges, force
             // 8wd loopfilter to prevent going off the visible edge.
-            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
+            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
                 int m_row_8 = m_row - m_row_16;
 
                 for (y = row_and_7; y < h + row_and_7; y++) {
-                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
-                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
+                    mask[0][y][0] |= m_row_16;
+                    mask[0][y][1] |= m_row_8;
                 }
             } else {
                 for (y = row_and_7; y < h + row_and_7; y++)
-                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
+                    mask[0][y][mask_id] |= m_row;
             }
 
-            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
+            l2 = tx + ss_v - 1;
+            step1d = 1 << l2;
+            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
-                    lflvl->mask[is_uv][1][y][0] |= m_col;
+                    mask[1][y][0] |= m_col;
                 if (y - row_and_7 == h - 1)
-                    lflvl->mask[is_uv][1][y][1] |= m_col;
+                    mask[1][y][1] |= m_col;
             } else {
                 for (y = row_and_7; y < h + row_and_7; y += step1d)
-                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
+                    mask[1][y][mask_id] |= m_col;
             }
         } else if (tx != TX_4X4) {
             int mask_id;
 
-            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
-            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
-            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
+            mask_id = (tx == TX_8X8) || (h == ss_v);
+            mask[1][row_and_7][mask_id] |= m_col;
+            mask_id = (tx == TX_8X8) || (w == ss_h);
             for (y = row_and_7; y < h + row_and_7; y++)
-                lflvl->mask[is_uv][0][y][mask_id] |= t;
-        } else if (is_uv) {
-            int t8 = t & 0x01, t4 = t - t8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
-            }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
+                mask[0][y][mask_id] |= t;
         } else {
-            int t8 = t & 0x11, t4 = t - t8;
+            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
 
             for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
+                mask[0][y][2] |= t4;
+                mask[0][y][1] |= t8;
             }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
+            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
         }
     }
 }
@@ -2958,7 +2995,8 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
         b->bl = bl;
         b->bp = bp;
         decode_mode(ctx);
-        b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
+        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
+                           (s->ss_v && h4 * 2 == (1 << b->tx)));
 
         if (!b->skip) {
             decode_coeffs(ctx);
@@ -2973,34 +3011,39 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
     case 8:  AV_ZERO64(&v);  break; \
     case 16: AV_ZERO128(&v); break; \
     }
-#define SPLAT_ZERO_YUV(dir, var, off, n) \
+#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
     do { \
         SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
-        SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
-        SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+        if (s->ss_##dir2) { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+        } else { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
+        } \
     } while (0)
 
             switch (w4) {
-            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
-            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
-            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
-            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
+            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
+            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
+            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
+            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
             }
             switch (h4) {
-            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
-            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
-            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
-            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
+            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
+            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
+            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
+            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
             }
         }
         if (s->pass == 1) {
             s->b++;
             s->block += w4 * h4 * 64;
-            s->uvblock[0] += w4 * h4 * 16;
-            s->uvblock[1] += w4 * h4 * 16;
+            s->uvblock[0] += w4 * h4 * 64 >> (s->ss_h + s->ss_v);
+            s->uvblock[1] += w4 * h4 * 64 >> (s->ss_h + s->ss_v);
             s->eob += 4 * w4 * h4;
-            s->uveob[0] += w4 * h4;
-            s->uveob[1] += w4 * h4;
+            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
 
             return;
         }
@@ -3073,11 +3116,12 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
         int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
 
         setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
-        mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
-        mask_edges(lflvl, 1, row7, col7, x_end, y_end,
-                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
-                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
-                   b->uvtx, skip_inter);
+        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
+        if (s->ss_h || s->ss_v)
+            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
+                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
+                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
+                       b->uvtx, skip_inter);
 
         if (!s->filter.lim_lut[lvl]) {
             int sharp = s->filter.sharpness;
@@ -3097,11 +3141,11 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
     if (s->pass == 2) {
         s->b++;
         s->block += w4 * h4 * 64;
-        s->uvblock[0] += w4 * h4 * 16;
-        s->uvblock[1] += w4 * h4 * 16;
+        s->uvblock[0] += w4 * h4 * 64 >> (s->ss_v + s->ss_h);
+        s->uvblock[1] += w4 * h4 * 64 >> (s->ss_v + s->ss_h);
         s->eob += 4 * w4 * h4;
-        s->uveob[0] += w4 * h4;
-        s->uveob[1] += w4 * h4;
+        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
     }
 }
 
@@ -3131,24 +3175,24 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
             case PARTITION_H:
                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                 yoff  += hbs * 8 * y_stride;
-                uvoff += hbs * 4 * uv_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_V:
                 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                 yoff  += hbs * 8;
-                uvoff += hbs * 4;
+                uvoff += hbs * 8 >> s->ss_h;
                 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_SPLIT:
                 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
                 decode_sb(ctx, row, col + hbs, lflvl,
-                          yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+                          yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
                 yoff  += hbs * 8 * y_stride;
-                uvoff += hbs * 4 * uv_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                 decode_sb(ctx, row + hbs, col + hbs, lflvl,
-                          yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+                          yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
                 break;
             default:
                 av_assert0(0);
@@ -3157,7 +3201,7 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
             bp = PARTITION_SPLIT;
             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
             decode_sb(ctx, row, col + hbs, lflvl,
-                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+                      yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
         } else {
             bp = PARTITION_H;
             decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
@@ -3167,7 +3211,7 @@ static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *l
             bp = PARTITION_SPLIT;
             decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
             yoff  += hbs * 8 * y_stride;
-            uvoff += hbs * 4 * uv_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
             decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         } else {
             bp = PARTITION_V;
@@ -3196,11 +3240,11 @@ static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filte
         decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
         if (b->bp == PARTITION_H && row + hbs < s->rows) {
             yoff  += hbs * 8 * y_stride;
-            uvoff += hbs * 4 * uv_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
             decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
             yoff  += hbs * 8;
-            uvoff += hbs * 4;
+            uvoff += hbs * 8 >> s->ss_h;
             decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
         }
     } else {
@@ -3208,33 +3252,186 @@ static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filte
         if (col + hbs < s->cols) { // FIXME why not <=?
             if (row + hbs < s->rows) {
                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
-                              uvoff + 4 * hbs, bl + 1);
+                              uvoff + (8 * hbs >> s->ss_h), bl + 1);
                 yoff  += hbs * 8 * y_stride;
-                uvoff += hbs * 4 * uv_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
-                                    yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+                                    yoff + 8 * hbs, uvoff + (8 * hbs >> s->ss_h), bl + 1);
             } else {
                 yoff  += hbs * 8;
-                uvoff += hbs * 4;
+                uvoff += hbs * 8 >> s->ss_h;
                 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
             }
         } else if (row + hbs < s->rows) {
             yoff  += hbs * 8 * y_stride;
-            uvoff += hbs * 4 * uv_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
             decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         }
     }
 }
 
+static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
+{
+    int y, x;
+
+    // filter edges between columns (e.g. block1 | block2)
+    for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
+        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
+        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
+        unsigned hm = hm1 | hm2 | hm13 | hm23;
+
+        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 >> ss_h) {
+            if (col || x > 1) {
+                if (hm1 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    if (hmask1[0] & x) {
+                        if (hmask2[0] & x) {
+                            av_assert2(l[8 << ss_v] == L);
+                            s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
+                        }
+                    } else if (hm2 & x) {
+                        L = l[8 << ss_v];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
+                                               [!!(hmask2[1] & x)]
+                                               [0](ptr, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
+                                            [0](ptr, ls, E, I, H);
+                    }
+                } else if (hm2 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
+                                        [0](ptr + 8 * ls, ls, E, I, H);
+                }
+            }
+            if (ss_h) {
+                if (x & 0xAA)
+                    l += 2;
+            } else {
+                if (hm13 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    if (hm23 & x) {
+                        L = l[8 << ss_v];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][0](ptr + 4, ls, E, I, H);
+                    }
+                } else if (hm23 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4, ls, E, I, H);
+                }
+                l++;
+            }
+        }
+    }
+}
+
+static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
+{
+    int y, x;
+
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
+        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
+
+        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16, l += 2 << ss_h) {
+            if (row || y) {
+                if (vm & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    if (vmask[0] & x) {
+                        if (vmask[0] & (x << (1 + ss_h))) {
+                            av_assert2(l[1 + ss_h] == L);
+                            s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
+                        }
+                    } else if (vm & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
+                                               [!!(vmask[1] & (x << (1 + ss_h)))]
+                                               [1](ptr, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
+                                            [1](ptr, ls, E, I, H);
+                    }
+                } else if (vm & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
+                                        [1](ptr + 8, ls, E, I, H);
+                }
+            }
+            if (!ss_v) {
+                if (vm3 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    if (vm3 & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
+                    }
+                } else if (vm3 & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8, ls, E, I, H);
+                }
+            }
+        }
+        if (ss_v) {
+            if (y & 1)
+                lvl += 16;
+        } else {
+            lvl += 8;
+        }
+    }
+}
+
 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
 {
     VP9Context *s = ctx->priv_data;
     AVFrame *f = s->frames[CUR_FRAME].tf.f;
-    uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
+    uint8_t *dst = f->data[0] + yoff;
     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
-    int y, x, p;
+    uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
+    int p;
 
     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
     // if you think of them as acting on a 8x8 block max, we can interleave
@@ -3242,225 +3439,13 @@ static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
     // 8 pixel blocks, and we won't always do that (we want at least 16px
     // to use SSE2 optimizations, perhaps 32 for AVX2)
 
-    // filter edges between columns, Y plane (e.g. block1 | block2)
-    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
-        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
-        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
-        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
-        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
-        unsigned hm = hm1 | hm2 | hm13 | hm23;
+    filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
+    filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
 
-        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
-            if (hm1 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (col || x > 1) {
-                    if (hmask1[0] & x) {
-                        if (hmask2[0] & x) {
-                            av_assert2(l[8] == L);
-                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
-                        }
-                    } else if (hm2 & x) {
-                        L = l[8];
-                        H |= (L >> 4) << 8;
-                        E |= s->filter.mblim_lut[L] << 8;
-                        I |= s->filter.lim_lut[L] << 8;
-                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
-                                               [!!(hmask2[1] & x)]
-                                               [0](ptr, ls_y, E, I, H);
-                    } else {
-                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                            [0](ptr, ls_y, E, I, H);
-                    }
-                }
-            } else if (hm2 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (col || x > 1) {
-                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
-                }
-            }
-            if (hm13 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (hm23 & x) {
-                    L = l[8];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
-                }
-            } else if (hm23 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
-            }
-        }
-    }
-
-    //                                          block1
-    // filter edges between rows, Y plane (e.g. ------)
-    //                                          block2
-    dst = f->data[0] + yoff;
-    lvl = lflvl->level;
-    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
-        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
-        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
-
-        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
-            if (row || y) {
-                if (vm & x) {
-                    int L = *l, H = L >> 4;
-                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                    if (vmask[0] & x) {
-                        if (vmask[0] & (x << 1)) {
-                            av_assert2(l[1] == L);
-                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
-                        }
-                    } else if (vm & (x << 1)) {
-                        L = l[1];
-                        H |= (L >> 4) << 8;
-                        E |= s->filter.mblim_lut[L] << 8;
-                        I |= s->filter.lim_lut[L] << 8;
-                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                               [!!(vmask[1] & (x << 1))]
-                                               [1](ptr, ls_y, E, I, H);
-                    } else {
-                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                            [1](ptr, ls_y, E, I, H);
-                    }
-                } else if (vm & (x << 1)) {
-                    int L = l[1], H = L >> 4;
-                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
-                                        [1](ptr + 8, ls_y, E, I, H);
-                }
-            }
-            if (vm3 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                if (vm3 & (x << 1)) {
-                    L = l[1];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                }
-            } else if (vm3 & (x << 1)) {
-                int L = l[1], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
-            }
-        }
-    }
-
-    // same principle but for U/V planes
     for (p = 0; p < 2; p++) {
-        lvl = lflvl->level;
         dst = f->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
-            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
-            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
-            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
-            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
-
-            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
-                if (col || x > 1) {
-                    if (hm1 & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                        if (hmask1[0] & x) {
-                            if (hmask2[0] & x) {
-                                av_assert2(l[16] == L);
-                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (hm2 & x) {
-                            L = l[16];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
-                                                   [!!(hmask2[1] & x)]
-                                                   [0](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                                [0](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (hm2 & x) {
-                        int L = l[16], H = L >> 4;
-                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
-                    }
-                }
-                if (x & 0xAA)
-                    l += 2;
-            }
-        }
-        lvl = lflvl->level;
-        dst = f->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
-            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
-            unsigned vm = vmask[0] | vmask[1] | vmask[2];
-
-            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
-                if (row || y) {
-                    if (vm & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                        if (vmask[0] & x) {
-                            if (vmask[0] & (x << 2)) {
-                                av_assert2(l[2] == L);
-                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (vm & (x << 2)) {
-                            L = l[2];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                                   [!!(vmask[1] & (x << 2))]
-                                                   [1](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                                [1](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (vm & (x << 2)) {
-                        int L = l[2], H = L >> 4;
-                        int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
-
-                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
-                                            [1](ptr + 8, ls_uv, E, I, H);
-                    }
-                }
-            }
-            if (y & 1)
-                lvl += 16;
-        }
+        filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
+        filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
     }
 }
 
@@ -3815,18 +3800,6 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
             return res;
     }
 
-    if (s->fullrange)
-        ctx->color_range = AVCOL_RANGE_JPEG;
-    else
-        ctx->color_range = AVCOL_RANGE_MPEG;
-
-    switch (s->colorspace) {
-    case 1: ctx->colorspace = AVCOL_SPC_BT470BG; break;
-    case 2: ctx->colorspace = AVCOL_SPC_BT709; break;
-    case 3: ctx->colorspace = AVCOL_SPC_SMPTE170M; break;
-    case 4: ctx->colorspace = AVCOL_SPC_SMPTE240M; break;
-    }
-
     // main tile decode loop
     memset(s->above_partition_ctx, 0, s->cols);
     memset(s->above_skip_ctx, 0, s->cols);
@@ -3836,8 +3809,8 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
         memset(s->above_mode_ctx, NEARESTMV, s->cols);
     }
     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
-    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
-    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
+    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
+    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
     memset(s->above_segpred_ctx, 0, s->cols);
     s->pass = s->frames[CUR_FRAME].uses_2pass =
         ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
@@ -3905,7 +3878,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
             }
 
             for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
-                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
+                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
                 struct VP9Filter *lflvl_ptr = s->lflvl;
                 ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
 
@@ -3922,7 +3895,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                             memset(s->left_mode_ctx, NEARESTMV, 8);
                         }
                         memset(s->left_y_nnz_ctx, 0, 16);
-                        memset(s->left_uv_nnz_ctx, 0, 16);
+                        memset(s->left_uv_nnz_ctx, 0, 32);
                         memset(s->left_segpred_ctx, 0, 8);
 
                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
@@ -3930,7 +3903,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
 
                     for (col = s->tiling.tile_col_start;
                          col < s->tiling.tile_col_end;
-                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
+                         col += 8, yoff2 += 64, uvoff2 += 64 >> s->ss_h, lflvl_ptr++) {
                         // FIXME integrate with lf code (i.e. zero after each
                         // use, similar to invtxfm coefficients, or similar)
                         if (s->pass != 1) {
@@ -3961,11 +3934,11 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                            f->data[0] + yoff + 63 * ls_y,
                            8 * s->cols);
                     memcpy(s->intra_pred_data[1],
-                           f->data[1] + uvoff + 31 * ls_uv,
-                           4 * s->cols);
+                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                           8 * s->cols >> s->ss_h);
                     memcpy(s->intra_pred_data[2],
-                           f->data[2] + uvoff + 31 * ls_uv,
-                           4 * s->cols);
+                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                           8 * s->cols >> s->ss_h);
                 }
 
                 // loopfilter one row
@@ -3974,7 +3947,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                     uvoff2 = uvoff;
                     lflvl_ptr = s->lflvl;
                     for (col = 0; col < s->cols;
-                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
+                         col += 8, yoff2 += 64, uvoff2 += 64 >> s->ss_h, lflvl_ptr++) {
                         loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
                     }
                 }
@@ -4051,7 +4024,6 @@ static av_cold int vp9_decode_init(AVCodecContext *ctx)
     VP9Context *s = ctx->priv_data;
 
     ctx->internal->allocate_progress = 1;
-    ctx->pix_fmt = AV_PIX_FMT_YUV420P;
     ff_vp9dsp_init(&s->dsp);
     ff_videodsp_init(&s->vdsp, 8);
     s->filter.sharpness = -1;
@@ -4094,6 +4066,8 @@ static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecCo
 
     s->invisible = ssrc->invisible;
     s->keyframe = ssrc->keyframe;
+    s->ss_v = ssrc->ss_v;
+    s->ss_h = ssrc->ss_h;
     s->segmentation.enabled = ssrc->segmentation.enabled;
     s->segmentation.update_map = ssrc->segmentation.update_map;
     memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
index c6ae432e26..f94438fa06 100644
--- a/libavcodec/vp9_mc_template.c
+++ b/libavcodec/vp9_mc_template.c
@@ -21,6 +21,12 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define ROUNDED_DIV_MVx2(a, b) \
+    (VP56mv) { .x = ROUNDED_DIV(a.x + b.x, 2), .y = ROUNDED_DIV(a.y + b.y, 2) }
+#define ROUNDED_DIV_MVx4(a, b, c, d) \
+    (VP56mv) { .x = ROUNDED_DIV(a.x + b.x + c.x + d.x, 4), \
+               .y = ROUNDED_DIV(a.y + b.y + c.y + d.y, 4) }
+
 static void FN(inter_pred)(AVCodecContext *ctx)
 {
     static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
@@ -44,6 +50,8 @@ static void FN(inter_pred)(AVCodecContext *ctx)
 
     // y inter pred
     if (b->bs > BS_8x8) {
+        VP56mv uvmv;
+
         if (b->bs == BS_8x4) {
             mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
@@ -52,6 +60,38 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                         s->dst[0] + 4 * ls_y, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
                         (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1, 0);
+            w1 = (w1 + s->ss_h) >> s->ss_h;
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 2, col << (3 - s->ss_h),
+                              &uvmv, 8 >> s->ss_h, 4, w1, h1, 0);
+            } else {
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 3, col << (3 - s->ss_h),
+                              &b->mv[0][0], 8 >> s->ss_h, 4, w1, h1, 0);
+                // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                // to get the motion vector for the bottom 4x4 block
+                // https://code.google.com/p/webm/issues/detail?id=993
+                if (s->ss_h == 0) {
+                    uvmv = b->mv[2][0];
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                }
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              (row << 3) + 4, col << (3 - s->ss_h),
+                              &uvmv, 8 >> s->ss_h, 4, w1, h1, 0);
+            }
 
             if (b->comp) {
                 mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y,
@@ -61,6 +101,38 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                             s->dst[0] + 4 * ls_y, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
                             (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2, 1);
+                w2 = (w2 + s->ss_h) >> s->ss_h;
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 2, col << (3 - s->ss_h),
+                                  &uvmv, 8 >> s->ss_h, 4, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 3, col << (3 - s->ss_h),
+                                  &b->mv[0][1], 8 >> s->ss_h, 4, w2, h2, 1);
+                    // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                    // to get the motion vector for the bottom 4x4 block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    if (s->ss_h == 0) {
+                        uvmv = b->mv[2][1];
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    }
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  (row << 3) + 4, col << (3 - s->ss_h),
+                                  &uvmv, 8 >> s->ss_h, 4, w2, h2, 1);
+                }
             }
         } else if (b->bs == BS_4x8) {
             mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
@@ -69,6 +141,30 @@ static void FN(inter_pred)(AVCodecContext *ctx)
             mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
                         row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1, 0);
+            h1 = (h1 + s->ss_v) >> s->ss_v;
+            if (s->ss_h) {
+                w1 = (w1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 2,
+                              &uvmv, 4, 8 >> s->ss_v, w1, h1, 0);
+            } else {
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 3,
+                              &b->mv[0][0], 4, 8 >> s->ss_v, w1, h1, 0);
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1] + 4, s->dst[2] + 4, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), (col << 3) + 4,
+                              &b->mv[1][0], 4, 8 >> s->ss_v, w1, h1, 0);
+            }
 
             if (b->comp) {
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
@@ -77,6 +173,30 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
                             row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2, 1);
+                h2 = (h2 + s->ss_v) >> s->ss_v;
+                if (s->ss_h) {
+                    w2 = (w2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 2,
+                                  &uvmv, 4, 8 >> s->ss_v, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 3,
+                                  &b->mv[0][1], 4, 8 >> s->ss_v, w2, h2, 1);
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1] + 4, s->dst[2] + 4, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), (col << 3) + 4,
+                                  &b->mv[1][1], 4, 8 >> s->ss_v, w2, h2, 1);
+                }
             }
         } else {
             av_assert2(b->bs == BS_4x4);
@@ -97,6 +217,81 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                         s->dst[0] + 4 * ls_y + 4, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
                         (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1, 0);
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx4(b->mv[0][0], b->mv[1][0],
+                                            b->mv[2][0], b->mv[3][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 2,
+                                  &uvmv, 4, 4, w1, h1, 0);
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 3,
+                                  &uvmv, 4, 4, w1, h1, 0);
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4, s->dst[2] + 4, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, (col << 3) + 4,
+                                  &uvmv, 4, 4, w1, h1, 0);
+                }
+            } else {
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 2,
+                                  &uvmv, 4, 4, w1, h1, 0);
+                    // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                    // bottom block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[2][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 2,
+                                  &uvmv, 4, 4, w1, h1, 0);
+                } else {
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 3,
+                                  &b->mv[0][0], 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4, s->dst[2] + 4, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, (col << 3) + 4,
+                                  &b->mv[1][0], 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 3,
+                                  &b->mv[2][0], 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv + 4, s->dst[2] + 4 * ls_uv + 4, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, (col << 3) + 4,
+                                  &b->mv[3][0], 4, 4, w1, h1, 0);
+                }
+            }
 
             if (b->comp) {
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
@@ -113,59 +308,112 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                             s->dst[0] + 4 * ls_y + 4, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
                             (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2, 1);
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx4(b->mv[0][1], b->mv[1][1],
+                                                b->mv[2][1], b->mv[3][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 2,
+                                      &uvmv, 4, 4, w2, h2, 1);
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 3,
+                                      &uvmv, 4, 4, w2, h2, 1);
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4, s->dst[2] + 4, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, (col << 3) + 4,
+                                      &uvmv, 4, 4, w2, h2, 1);
+                    }
+                } else {
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 2,
+                                      &uvmv, 4, 4, w2, h2, 1);
+                        // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                        // bottom block
+                        // https://code.google.com/p/webm/issues/detail?id=993
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[2][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 2,
+                                      &uvmv, 4, 4, w2, h2, 1);
+                    } else {
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 3,
+                                      &b->mv[0][1], 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4, s->dst[2] + 4, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, (col << 3) + 4,
+                                      &b->mv[1][1], 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 3,
+                                      &b->mv[2][1], 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv + 4, s->dst[2] + 4 * ls_uv + 4, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, (col << 3) + 4,
+                                      &b->mv[3][1], 4, 4, w2, h2, 1);
+                    }
+                }
             }
         }
     } else {
         int bwl = bwlog_tab[0][b->bs];
         int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
+        int uvbw = bwh_tab[s->ss_h][b->bs][0] * 4, uvbh = bwh_tab[s->ss_v][b->bs][1] * 4;
 
         mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
                     ref1->data[0], ref1->linesize[0], tref1,
-                    row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1, 0);
-
-        if (b->comp)
-            mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
-                        ref2->data[0], ref2->linesize[0], tref2,
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2, 1);
-    }
-
-    // uv inter pred
-    {
-        int bwl = bwlog_tab[1][b->bs];
-        int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
-        VP56mv mvuv;
-
-        w1 = (w1 + 1) >> 1;
-        h1 = (h1 + 1) >> 1;
-        if (b->comp) {
-            w2 = (w2 + 1) >> 1;
-            h2 = (h2 + 1) >> 1;
-        }
-        if (b->bs > BS_8x8) {
-            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
-            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
-        } else {
-            mvuv = b->mv[0][0];
-        }
-
-        mc_chroma_dir(s, mc[bwl][b->filter][0],
+                    row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1, 0);
+        w1 = (w1 + s->ss_h) >> s->ss_h;
+        h1 = (h1 + s->ss_v) >> s->ss_v;
+        mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][0],
                       s->dst[1], s->dst[2], ls_uv,
                       ref1->data[1], ref1->linesize[1],
                       ref1->data[2], ref1->linesize[2], tref1,
-                      row << 2, col << 2, &mvuv, bw, bh, w1, h1, 0);
+                      row << (3 - s->ss_v), col << (3 - s->ss_h),
+                      &b->mv[0][0], uvbw, uvbh, w1, h1, 0);
 
         if (b->comp) {
-            if (b->bs > BS_8x8) {
-                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
-                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
-            } else {
-                mvuv = b->mv[0][1];
-            }
-            mc_chroma_dir(s, mc[bwl][b->filter][1],
+            mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0], tref2,
+                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2, 1);
+            w2 = (w2 + s->ss_h) >> s->ss_h;
+            h2 = (h2 + s->ss_v) >> s->ss_v;
+            mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][1],
                           s->dst[1], s->dst[2], ls_uv,
                           ref2->data[1], ref2->linesize[1],
                           ref2->data[2], ref2->linesize[2], tref2,
-                          row << 2, col << 2, &mvuv, bw, bh, w2, h2, 1);
+                          row << (3 - s->ss_v), col << (3 - s->ss_h),
+                          &b->mv[0][1], uvbw, uvbh, w2, h2, 1);
         }
     }
 }
diff --git a/tests/fate/vpx.mak b/tests/fate/vpx.mak
index d77d5cb5c1..3bc8e9a38e 100644
--- a/tests/fate/vpx.mak
+++ b/tests/fate/vpx.mak
@@ -85,6 +85,12 @@ fate-vp9$(2)-$(1): CMD = framemd5 $(3) -i $(TARGET_SAMPLES)/vp9-test-vectors/vp9
 fate-vp9$(2)-$(1): REF = $(SRC_PATH)/tests/ref/fate/vp9-$(1)
 endef
 
+define FATE_VP9_PROFILE_SUITE
+FATE_VP9-$(CONFIG_MATROSKA_DEMUXER) += fate-vp9p$(2)-$(1)
+fate-vp9p$(2)-$(1): CMD = framemd5 -i $(TARGET_SAMPLES)/vp9-test-vectors/vp9$(2)-2-$(1).webm
+fate-vp9p$(2)-$(1): REF = $(SRC_PATH)/tests/ref/fate/vp9p$(2)-$(1)
+endef
+
 VP9_Q = 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 \
         16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 \
         32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 \
@@ -94,24 +100,27 @@ VP9_SIZE_A = 08 10 16 18 32 34 64 66
 VP9_SIZE_B = 196 198 200 202 208 210 224 226
 
 define FATE_VP9_FULL
-$(foreach Q,$(VP9_Q),$(eval $(call FATE_VP9_SUITE,00-quantizer-$(Q),$(1),$(2))))
-$(foreach SHARP,$(VP9_SHARP),$(eval $(call FATE_VP9_SUITE,01-sharpness-$(SHARP),$(1),$(2))))
-$(foreach W,$(VP9_SIZE_A),$(eval $(foreach H,$(VP9_SIZE_A),$(eval $(call FATE_VP9_SUITE,02-size-$(W)x$(H),$(1),$(2))))))
-$(foreach W,$(VP9_SIZE_B),$(eval $(foreach H,$(VP9_SIZE_B),$(eval $(call FATE_VP9_SUITE,03-size-$(W)x$(H),$(1),$(2))))))
-$(eval $(call FATE_VP9_SUITE,03-deltaq,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,06-bilinear,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,09-lf_deltas,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,10-show-existing-frame,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,10-show-existing-frame2,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,15-segkey_adpq,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,16-intra-only,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,2pass-akiyo,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,parallelmode-akiyo,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,segmentation-aq-akiyo,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,segmentation-sf-akiyo,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,tiling-pedestrian,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,trac3849,$(1),$(2)))
-$(eval $(call FATE_VP9_SUITE,trac4359,$(1),$(2)))
+$(foreach Q,$(VP9_Q),$(eval $(call FATE_VP9_SUITE,00-quantizer-$(Q))))
+$(foreach SHARP,$(VP9_SHARP),$(eval $(call FATE_VP9_SUITE,01-sharpness-$(SHARP))))
+$(foreach W,$(VP9_SIZE_A),$(eval $(foreach H,$(VP9_SIZE_A),$(eval $(call FATE_VP9_SUITE,02-size-$(W)x$(H))))))
+$(foreach W,$(VP9_SIZE_B),$(eval $(foreach H,$(VP9_SIZE_B),$(eval $(call FATE_VP9_SUITE,03-size-$(W)x$(H))))))
+$(eval $(call FATE_VP9_SUITE,03-deltaq))
+$(eval $(call FATE_VP9_PROFILE_SUITE,04-yuv444,1))
+$(eval $(call FATE_VP9_PROFILE_SUITE,04-yuv440,1))
+$(eval $(call FATE_VP9_PROFILE_SUITE,04-yuv422,1))
+$(eval $(call FATE_VP9_SUITE,06-bilinear))
+$(eval $(call FATE_VP9_SUITE,09-lf_deltas))
+$(eval $(call FATE_VP9_SUITE,10-show-existing-frame))
+$(eval $(call FATE_VP9_SUITE,10-show-existing-frame2))
+$(eval $(call FATE_VP9_SUITE,15-segkey_adpq))
+$(eval $(call FATE_VP9_SUITE,16-intra-only))
+$(eval $(call FATE_VP9_SUITE,2pass-akiyo))
+$(eval $(call FATE_VP9_SUITE,parallelmode-akiyo))
+$(eval $(call FATE_VP9_SUITE,segmentation-aq-akiyo))
+$(eval $(call FATE_VP9_SUITE,segmentation-sf-akiyo))
+$(eval $(call FATE_VP9_SUITE,tiling-pedestrian))
+$(eval $(call FATE_VP9_SUITE,trac3849))
+$(eval $(call FATE_VP9_SUITE,trac4359))
 endef
 
 $(eval $(call FATE_VP9_FULL))
diff --git a/tests/ref/fate/vp9p1-04-yuv422 b/tests/ref/fate/vp9p1-04-yuv422
new file mode 100644
index 0000000000..59abfb002c
--- /dev/null
+++ b/tests/ref/fate/vp9p1-04-yuv422
@@ -0,0 +1,15 @@
+#format: frame checksums
+#version: 1
+#hash: MD5
+#tb 0: 1/50
+#stream#, dts,        pts, duration,     size, hash
+0,          0,          0,        1,    28800, b81b8a8444ac6ce4a4807c37e0a44c8b
+0,          1,          1,        1,    28800, 344458b82d35ea9944dc841643fc25c2
+0,          2,          2,        1,    28800, 376a4bb3944f052191963740b980eb26
+0,          3,          3,        1,    28800, 2fecb02c842bd7d588415904f2d3a82d
+0,          4,          4,        1,    28800, 0fda2f1dabba5c179599190f179b9782
+0,          5,          5,        1,    28800, a88ac885ee59e3a3a01fa483cdd40274
+0,          6,          6,        1,    28800, e76b488ffa70a05457fc046e7b999c56
+0,          7,          7,        1,    28800, 74ae5e52162f5bbc95258d44a2dd647c
+0,          8,          8,        1,    28800, 0c017e2b12e5192c8d598941d9c93306
+0,          9,          9,        1,    28800, ca3941ee43b7033cb48f8498af127d53
diff --git a/tests/ref/fate/vp9p1-04-yuv440 b/tests/ref/fate/vp9p1-04-yuv440
new file mode 100644
index 0000000000..0c28f36535
--- /dev/null
+++ b/tests/ref/fate/vp9p1-04-yuv440
@@ -0,0 +1,15 @@
+#format: frame checksums
+#version: 1
+#hash: MD5
+#tb 0: 1/50
+#stream#, dts,        pts, duration,     size, hash
+0,          0,          0,        1,    28800, 61157ad4fb02a254de8f34ae7b8915dc
+0,          1,          1,        1,    28800, 9431337382bf90d40aa417e297ac05da
+0,          2,          2,        1,    28800, 56b739049cc9e97a1d82018bba3db0ee
+0,          3,          3,        1,    28800, 75138a9b6bb905b2f79a1ebb959ddfea
+0,          4,          4,        1,    28800, 141b2fc9625fad86577838d84a276ef8
+0,          5,          5,        1,    28800, b364668c44a237d4e532e086a55401a9
+0,          6,          6,        1,    28800, a4ca6014d5194e4c921a4cb4289eb315
+0,          7,          7,        1,    28800, cfcacb3d5086d3861f4712a3c87a6b6c
+0,          8,          8,        1,    28800, 228d3fd3d849d021f3690cc538edb0a3
+0,          9,          9,        1,    28800, 97ecf281eb1130723d70e3c8803fa814
diff --git a/tests/ref/fate/vp9p1-04-yuv444 b/tests/ref/fate/vp9p1-04-yuv444
new file mode 100644
index 0000000000..e9559c6c6e
--- /dev/null
+++ b/tests/ref/fate/vp9p1-04-yuv444
@@ -0,0 +1,15 @@
+#format: frame checksums
+#version: 1
+#hash: MD5
+#tb 0: 1/25
+#stream#, dts,        pts, duration,     size, hash
+0,          0,          0,        1,   304128, 859df7b3661783e337a16ee79f3c20bc
+0,          1,          1,        1,   304128, 3b3ccf344cd5a478c4c1fa422497183d
+0,          2,          2,        1,   304128, 3be1f565823cb88013a14a93a3cf9480
+0,          3,          3,        1,   304128, 6e188a963deaf46c2d6e741b03c4240c
+0,          4,          4,        1,   304128, 82ead184ae478ac821b1b4b72f28c9cd
+0,          5,          5,        1,   304128, 59bb43badc76b39a228b1ad96b6339ca
+0,          6,          6,        1,   304128, 2eaee790fc188e2251b92dd4ea90c42a
+0,          7,          7,        1,   304128, 2a95f8727589e710dc1b95400916b72e
+0,          8,          8,        1,   304128, b7032f73544a7108fcdcaca2832ecc32
+0,          9,          9,        1,   304128, b7778c35b30bcc400b25ed0e5b7913e1