/*
* This file is part of mpv.
*
* mpv is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* mpv is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include
#include "misc/ctype.h"
#include "common/common.h"
#include "common/msg.h"
#include "options/options.h"
#include "sd.h"
// Filter for removing subtitle additions for deaf or hard-of-hearing (SDH)
// This is for English, but may in part work for others too.
// The intention is that it can always be active so may not remove
// all SDH parts.
// It is for filtering ASS encoded subtitles
// Left/right enclosure symbols that differ from each other.
// Each row is {left, right}; the table ends with a row whose left entry
// is NULL.
static const char *const enclosure_pair[][2] = {
    { "(",      ")"      },
    { "[",      "]"      },
    { "\uFF08", "\uFF09" },
    { NULL,     NULL     },
};
// Fixed-capacity character buffer that the filter writes its output into.
struct buffer {
// storage, allocated by init_buf()
char *string;
// capacity of string in bytes
int length;
// index of the next byte to be written by append()
int pos;
};
// Prepare an empty write buffer with room for exactly `length` bytes.
// The storage is talloc allocated without a parent context.
static void init_buf(struct buffer *buf, int length)
{
    buf->length = length;
    buf->pos = 0;
    buf->string = talloc_size(NULL, length);
}
// Append one character to the write buffer.
//
// Parameters:
// sd filter instance (not used here)
// buf write buffer
// c character to store
//
// If the buffer is full the character is dropped, except for '\0' which
// is then stored in the very last byte so the string is always terminated.
//
// Returns c.
static inline int append(struct sd_filter *sd, struct buffer *buf, char c)
{
    int has_room = buf->pos >= 0 && buf->pos < buf->length;
    if (has_room) {
        buf->string[buf->pos] = c;
        buf->pos += 1;
        return c;
    }

    // no room left: only make sure a terminating \0 ends up in the buffer
    if (c == '\0')
        buf->string[buf->length - 1] = '\0';
    return c;
}
// Number of bytes occupied by the (UTF-8) character at the start of str.
//
// The expected length is derived from the first byte:
//   0xxxxxxx -> 1, bit 5 clear -> 2, bit 4 clear -> 3, bit 3 clear -> 4.
// Bit 6 is not inspected, so a stray continuation byte is classified by
// its bits 5..3 exactly like a lead byte would be.
//
// The result never runs past the terminating '\0': a truncated sequence
// yields the number of bytes that are actually present.
//
// Returns 0 for NULL, for an empty string, and for a first byte of the
// form 11111xxx.
static int get_char_bytes(char *str)
{
    if (!str || !str[0])
        return 0;

    // Inspect the byte as unsigned: plain char may be signed, and right
    // shifting a negative value is implementation-defined.
    unsigned char lead = (unsigned char)str[0];
    int expected;
    if (!(lead & 0x80)) {
        expected = 1;
    } else if (!(lead & 0x20)) {
        expected = 2;
    } else if (!(lead & 0x10)) {
        expected = 3;
    } else if (!(lead & 0x08)) {
        expected = 4;
    } else {
        return 0;
    }

    // Bounded length: stop early at the string terminator.
    int bytes = 0;
    while (bytes < expected && str[bytes])
        bytes++;
    return bytes;
}
// Look up the closing symbol that belongs to an opening symbol.
//
// Parameters:
// left '\0' terminated opening symbol
//
// Returns the mapped right hand symbol from enclosure_pair, or left itself
// when there is no mapping (the symbol then closes itself).
static const char *get_right_enclosure(char *left)
{
    const char *match = left;
    for (int idx = 0; enclosure_pair[idx][0] != NULL; idx++) {
        if (!strcmp(enclosure_pair[idx][0], left)) {
            match = enclosure_pair[idx][1];
            break;
        }
    }
    return match;
}
// Check whether the character at the start of str is one of the
// configured left hand enclosure characters.
//
// Parameters:
// sd filter instance; sub_filter_SDH_enclosures lists the accepted characters
// str text to examine (may be NULL or empty)
//
// The option string is walked one (possibly multi-byte) character at a
// time and every byte of that character must match. Comparing single bytes
// instead would make the lead or continuation bytes of a multi-byte
// enclosure match unrelated characters that merely share such a byte.
//
// Returns true if str starts with one of the enclosure characters.
static bool valid_left_enclosure(struct sd_filter *sd, char *str)
{
    char *enclosures = sd->opts->sub_filter_SDH_enclosures;
    if (!str || !str[0] || !enclosures)
        return false;

    for (char *enc = enclosures; *enc; ) {
        int enc_bytes = get_char_bytes(enc);
        // unclassifiable byte: treat it as one byte so the walk progresses
        if (enc_bytes <= 0)
            enc_bytes = 1;
        if (strncmp(str, enc, enc_bytes) == 0)
            return true;
        enc += enc_bytes;
    }
    return false;
}
// Copy ASS override tags, if there are any at the current position,
// from the source string to the destination buffer. Copying stops at the
// first character following the last consecutive '{text}' group.
//
// Parameters:
// sd filter instance, passed on to append()
// rpp read pointer pointer to source string, updated on return
// buf write buffer
//
// On return the read pointer is at the position after the tags. A group
// without closing '}' is copied up to the end of the string.
static void copy_ass(struct sd_filter *sd, char **rpp, struct buffer *buf)
{
    char *cur = *rpp;
    while (*cur == '{') {
        // copy one group, closing brace included
        for (;;) {
            if (!*cur)
                break;
            char ch = append(sd, buf, *cur);
            cur++;
            if (ch == '}')
                break;
        }
    }
    *rpp = cur;
}
// forward declaration: skip_speaker_label() calls skip_enclosed(), which is
// defined further down in this file
static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
const char *left, const char *right);
// Check for a speaker label, like MAN:
// Normal subtitles may include mixed case text followed by ':' so only
// upper case letters are accepted, plus lower case l which for some looks
// like upper case I. With sub_filter_SDH_harder lower case letters and
// parentheses are acceptable too.
//
// Parameters:
// sd filter instance (options are read from it)
// rpp read pointer pointer to source string, updated on return
// buf write buffer
//
// Scans the source string, copying ASS tags, a leading '-' and leading
// spaces to the destination buffer, and skips the speaker label if there
// is one.
//
// If no label was found the read pointer and the write position in the
// buffer are unchanged, otherwise they point to the next position after
// the label and to the next write position.
static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer *buf)
{
    const int harder = sd->opts->sub_filter_SDH_harder;
    const int saved_pos = buf->pos;
    char *cur = *rpp;

    // leading tags, an optional '-', then any mix of spaces and tags
    copy_ass(sd, &cur, buf);
    if (*cur == '-') {
        append(sd, buf, *cur);
        cur++;
    }
    copy_ass(sd, &cur, buf);
    while (*cur == ' ') {
        append(sd, buf, *cur);
        cur++;
        copy_ass(sd, &cur, buf);
    }

    // walk over characters that may be part of a label until ':' shows up
    while (*cur && *cur != ':') {
        char ch = *cur;
        if (ch == '{') {
            copy_ass(sd, &cur, buf);
            continue;
        }
        if (ch == '[') {
            // not uncommon with [xxxx]: which should also be skipped
            if (!skip_enclosed(sd, &cur, buf, "[", "]"))
                goto no_label;
            continue;
        }

        int letter_ok = mp_isalpha(ch) &&
                        (harder || mp_isupper(ch) || ch == 'l');
        int paren_ok = harder && (ch == '(' || ch == ')');
        int other_ok = mp_isdigit(ch) || ch == ' ' || ch == '\'' ||
                       ch == '#' || ch == '.' || ch == ',';
        if (!(letter_ok || other_ok || paren_ok))
            goto no_label;
        cur++;
    }

    // ran into the end of the data without seeing ':'
    if (!*cur)
        goto no_label;

    cur++; // skip :
    copy_ass(sd, &cur, buf);

    if (*cur) {
        if (cur[0] == '\\' && cur[1] == 'N') {
            // line end follows - skip it as line is empty
            cur += 2;
        } else if (cur[0] == ' ') {
            while (*cur == ' ')
                cur++;
            // line end follows - skip it as line is empty
            if (cur[0] == '\\' && cur[1] == 'N')
                cur += 2;
        } else {
            // non space follows - no speaker label
            goto no_label;
        }
    }

    *rpp = cur;
    return;

no_label:
    // discard everything written while probing for the label
    buf->pos = saved_pos;
}
// Check for text enclosed in symbols, like (SOUND)
// and skip it while preserving ass tags.
// Parentheses are a special case since normal subtitles may have
// them so only upper case is accepted and lower case l which for
// some looks like upper case I. If sub_filter_SDH_harder is used,
// both upper and lower case is accepted.
//
// For other symbols, all text in between is removed.
//
// Parameters:
// sd filter instance (options are read from it)
// rpp read pointer pointer to source string, updated on return
// buf write buffer
// left opening symbol ('\0' terminated, possibly multi-byte)
// right closing symbol ('\0' terminated, possibly multi-byte)
//
// The first character in the source string must be the opening symbol.
// ASS tags found inside are copied to the destination buffer while the
// enclosed text is skipped if it looks like SDH. Spaces directly after
// the closing symbol are skipped as well.
//
// The closing symbol is matched with all of its bytes. Matching only its
// first byte would let any character that shares the lead byte of a
// multi-byte closing symbol terminate the search.
//
// Returns true if enclosed text was removed.
// If not valid SDH the read pointer and write buffer position are unchanged,
// otherwise they point to next position after text and next write position.
static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
                          const char *left, const char *right)
{
    bool filter_harder = sd->opts->sub_filter_SDH_harder;
    char *rp = *rpp;
    int old_pos = buf->pos;
    bool parenthesis = strcmp(left, "(") == 0 || strcmp(left, "\uFF08") == 0;
    size_t right_len = strlen(right);

    // an empty closing symbol can never be found
    if (!right_len)
        return false;

    // skip past the left character
    rp += get_char_bytes(rp);

    // skip past valid data searching for the right character
    bool only_digits = parenthesis;
    while (*rp && strncmp(rp, right, right_len) != 0) {
        if (rp[0] == '{') {
            copy_ass(sd, &rp, buf);
        } else if (parenthesis && ((mp_isalpha(rp[0]) &&
                    (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) ||
                   mp_isdigit(rp[0]) ||
                   rp[0] == ' ' || rp[0] == '\'' || rp[0] == '#' ||
                   rp[0] == '.' || rp[0] == ',' ||
                   rp[0] == '-' || rp[0] == '"' || rp[0] == '\\')) {
            if (!mp_isdigit(rp[0]))
                only_digits = false;
            rp++;
        } else if (parenthesis) {
            // character that is not expected in SDH within parentheses
            buf->pos = old_pos;
            return false;
        } else {
            rp++;
        }
    }
    if (!*rp) {
        // closing symbol was not found
        buf->pos = old_pos;
        return false;
    }
    if (only_digits) {
        // number within parentheses is probably not SDH
        buf->pos = old_pos;
        return false;
    }

    // skip past the closing symbol that was just matched
    rp += right_len;

    // skip trailing spaces
    while (rp[0] == ' ')
        rp++;

    *rpp = rp;
    return true;
}
// Remove a leading hyphen and the spaces following it in the write buffer.
//
// Parameters:
// sd filter instance, passed on to append() and copy_ass()
// start_pos start position in buffer
// buf buffer to remove in
//
// Leading ASS tags at start_pos are kept. When characters are removed the
// remaining ones are moved back and the write position shrinks
// accordingly. Nothing changes if there is no leading '-'.
static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos,
                                        struct buffer *buf)
{
    const int saved_pos = buf->pos;
    if (start_pos < 0 || start_pos >= saved_pos)
        return;

    append(sd, buf, '\0'); // \0 terminate for reading

    // move past leading ass tags
    char *text = buf->string;
    int idx = start_pos;
    while (text[idx] == '{') {
        while (text[idx] && text[idx] != '}')
            idx++;
        if (text[idx])
            idx++; // skip past '}'
    }

    // without a leading '-' there is nothing to remove
    if (text[idx] != '-') {
        buf->pos = saved_pos;
        return;
    }

    // Read and write within the same buffer: writing restarts where the
    // '-' was and reading starts right after it, so the write position
    // never overtakes the read pointer.
    char *src = text + idx + 1;
    buf->pos = idx;
    copy_ass(sd, &src, buf);
    while (*src == ' ') {
        src++; // skip ' '
        copy_ass(sd, &src, buf);
    }

    // copy the rest
    for (; *src; src++)
        append(sd, buf, *src);
}
// Filter ASS formatted string for SDH
//
// Parameters:
// sd filter instance, passed on to the helper functions
// data ASS line
// length length of ASS line
// toff Text offset from data. required: 0 <= toff <= length
//
// The first toff bytes are copied unchanged. The remaining text is handled
// line by line (lines are separated by \N): a speaker label at the start of
// a line and enclosed SDH text are skipped while ASS override tags are kept.
// A line that ends up without normal text is joined with the following line,
// and a trailing line without normal text is dropped.
//
// Returns a talloc allocated string with filtered ASS data (may be the same
// content as original if no SDH was found) which must be released
// by caller using talloc_free.
//
// Returns NULL if filtering resulted in all of ASS data being removed so no
// subtitle should be output
static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t toff)
{
struct buffer writebuf;
struct buffer *buf = &writebuf;
init_buf(buf, length + 1); // with room for terminating '\0'
// pre-text headers into buf, rp is the (null-terminated) remaining text
char *ass = talloc_strndup(NULL, data, length), *rp = ass;
while (rp - ass < toff)
append(sd, buf, *rp++);
bool contains_text = false; // true if non SDH text was found
bool line_with_text = false; // if last line contained text
int wp_line_start = buf->pos; // write pos to start of last line
int wp_line_end = buf->pos; // write pos to end of previous line with text (\N)
// go through the lines in the text
// they are separated by \N
while (*rp) {
line_with_text = false;
wp_line_start = buf->pos;
// skip any speaker label
skip_speaker_label(sd, &rp, buf);
// go through the rest of the line looking for SDH in () or []
while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
copy_ass(sd, &rp, buf);
// left receives a copy of the character at rp (at most 4 bytes plus
// terminator) when that character may open an enclosure
char left[5] = {0};
const char *right = NULL;
if (valid_left_enclosure(sd, rp)) {
int bytes = get_char_bytes(rp);
for (int i = 0; i < bytes; i++)
left[i] = rp[i];
left[bytes] = '\0';
right = get_right_enclosure(left);
}
if (left[0] && right && right[0]) {
if (!skip_enclosed(sd, &rp, buf, left, right)) {
// not SDH: keep the first byte of the opening symbol as normal text
append(sd, buf, rp[0]);
rp++;
line_with_text = true;
}
} else if (*rp && rp[0] != '\\') {
// printable ASCII other than '-' or a byte >= 0xC0 counts as text;
// spaces and hyphens alone do not make a line worth keeping
if ((rp[0] > 32 && rp[0] < 127 && rp[0] != '-') ||
(unsigned char)rp[0] >= 0xC0)
{
line_with_text = true;
}
append(sd, buf, rp[0]);
rp++;
} else if (rp[0] == '\\' && rp[1] != 'N') {
// backslash that does not start \N is copied as is
append(sd, buf, rp[0]);
rp++;
}
}
// either end of data or ASS line end defined by separating \N
if (*rp) {
// ASS line end
if (line_with_text) {
contains_text = true;
// remember the position before \N so it can be dropped again if no
// later line contains text
wp_line_end = buf->pos;
append(sd, buf, rp[0]); // copy backslash
append(sd, buf, rp[1]); // copy N
rp += 2; // move read pointer past \N
} else {
// no text in line, remove leading hyphen and spaces
remove_leading_hyphen_space(sd, wp_line_start, buf);
// and join with next line
rp += 2; // move read pointer past \N
}
}
}
// if no normal text in last line - remove last line
// by moving write pointer to start of last line
if (!line_with_text) {
buf->pos = wp_line_end;
} else {
contains_text = true;
}
// the working copy of the input is no longer needed
talloc_free(ass);
if (contains_text) {
// the ASS data contained normal text after filtering
append(sd, buf, '\0'); // '\0' terminate
return buf->string;
} else {
// all data removed by filtering
talloc_free(buf->string);
return NULL;
}
}
// Decide whether the SDH filter should be used for this subtitle stream.
//
// Returns true only if the codec is "ass", the sub_filter_SDH option is
// set and an event format is available; otherwise false.
static bool sdh_init(struct sd_filter *ft)
{
    bool is_ass = strcmp(ft->codec, "ass") == 0;
    if (!is_ass || !ft->opts->sub_filter_SDH)
        return false;

    if (ft->event_format)
        return true;

    MP_VERBOSE(ft, "SDH filtering not possible - format missing\n");
    return false;
}
// Run the SDH filter on one subtitle packet.
//
// Returns pkt itself if it has no text, is too large, or filtering did not
// change it. Returns NULL if filtering removed all text, or if the new
// packet could not be created. Otherwise returns a new packet holding the
// filtered data with the attributes of pkt copied over.
static struct demux_packet *sdh_filter(struct sd_filter *ft,
                                       struct demux_packet *pkt)
{
    bstr text = sd_ass_pkt_text(ft, pkt, sd_ass_fmt_offset(ft->event_format));
    if (!text.start || !text.len || pkt->len >= INT_MAX)
        return pkt; // we don't touch it

    char *filtered = filter_SDH(ft, (char *)pkt->buffer, (int)pkt->len,
                                text.start - pkt->buffer);
    if (!filtered)
        return NULL;

    // unmodified data needs no new packet
    struct demux_packet *result = pkt;
    if (bstrcmp0((bstr){(char *)pkt->buffer, pkt->len}, filtered) != 0) {
        // Stupidly, this copies it again. One could possibly allocate the
        // packet for writing in the first place (new_demux_packet()) and use
        // demux_packet_shorten().
        result = new_demux_packet_from(filtered, strlen(filtered));
        if (result)
            demux_packet_copy_attribs(result, pkt);
    }
    talloc_free(filtered);
    return result;
}
// Function table exposing the SDH filter: sdh_init() decides whether the
// filter applies, sdh_filter() processes each packet.
const struct sd_filter_functions sd_filter_sdh = {
    .filter = sdh_filter,
    .init   = sdh_init,
};