在某网站中使用的字幕文件名猜测视频文件名的算法 ; 论野生技术&二次元

觉得有点意思，拿出来和大家一起研究下，欢迎提供更好的建议。

因为爬x手的时候，网页已经没了，只能自食其力；用unrar(rarlab上下的，apt-get里的太古老)获得rar内的文件名，用zipfile模块列zip包的，抓的时候直接把返回的内容插到数据库了，没分析，因为没那么多时间去想算法，还是先把dirty data撸下来再说。

然后开始正文了www

按照这么一个基本思想，字幕文件名除了扩展名以外，其余部分和视频文件是一样的。如果压缩包里面只有一个文件，那么直接就是它去掉扩展名就好了；但是如果有多个版本的字幕（比如eng，GB，BIG5等），那就需要一个字符串最大匹配的算法。←为了装B取的名字

我是这么想的，首先需要一个最小单位来比较，不然一个一个字匹配，加上选择排列的时间复杂度，估计要跪；所以要减少最小单位的个数。因为大部分文件名用“.”、空格、“-”、“_”、“][”（二次元一般比较喜欢用中括号）作为分隔符，所以先找一个能把文件名切割成最多块的分隔符出来：

# Candidate separator characters; keep the one that cuts the names in `lst`
# into the largest total number of pieces.
splt = '.-_ ]'
m_splt = max(splt, key=lambda sep: sum(len(name.split(sep)) for name in lst))

# Candidate separator characters; keep the one that cuts the names in `lst`
# into the largest total number of pieces.
splt = '.-_ ]'
m_splt = max(splt, key = lambda x:sum(map(lambda l:len(l.split(x)), lst)))

分得越多当然就匹配的粒度更细嘛。

分割完之后，压缩包里的各个文件名都变成了一个个列表

然后就是每个单位做一个排列组合，如果有超过阈值的文件的某一个单位相同，则认为这是共同部分

def getEqual(l):
    """Pick the token that most pairs of entries in ``l`` agree on.

    ``l`` is a sequence of tokens. Falsy entries (empty strings, ``None``)
    are never paired, but they still count towards ``len(l)`` and therefore
    towards the total number of pairs.

    Two tokens agree when they are identical or differ only in letter case.
    For a case-only difference the vote goes to the ``getCapital`` form when
    one token is exactly the ``getCapital`` form of the other, otherwise to
    the fully upper-cased form.

    Returns ``(True, token)`` when the most voted token collects more than
    30% of all ``len(l) * (len(l) - 1) / 2`` pairs, or every pair;
    otherwise ``(False, '')``.
    """
    cnt = len(l)
    equals = {}
    for i, j in itertools.combinations(l, 2):
        if not i or not j:
            continue
        if i == j:
            key = i
        elif i.upper() != j.upper():
            continue
        elif getCapital(i) == j or getCapital(j) == i:
            # getCapital is defined elsewhere (presumably it upper-cases the
            # first letter - verify). This test has to run before the
            # upper() fallback: tokens that differ only in case always have
            # equal upper() forms, so the opposite order never reaches it.
            key = getCapital(i)
        else:
            key = i.upper()
        equals[key] = equals.get(key, 0) + 1
    if not equals:
        return False, ''
    # items() rather than iteritems() so this runs on Python 2 and 3 alike.
    best, votes = max(equals.items(), key=lambda x: x[1])
    # cnt * (cnt - 1) is always even, so floor division is exact.
    _comb = cnt * (cnt - 1) // 2
    if votes > 0.3 * _comb or votes == _comb:
        return True, best
    return False, ''

def getEqual(l):
    """Pick the token that most pairs of entries in ``l`` agree on.

    ``l`` is a sequence of tokens. Falsy entries (empty strings, ``None``)
    are never paired, but they still count towards ``len(l)`` and therefore
    towards the total number of pairs.

    Two tokens agree when they are identical or differ only in letter case.
    For a case-only difference the vote goes to the ``getCapital`` form when
    one token is exactly the ``getCapital`` form of the other, otherwise to
    the fully upper-cased form.

    Returns ``(True, token)`` when the most voted token collects more than
    30% of all ``len(l) * (len(l) - 1) / 2`` pairs, or every pair;
    otherwise ``(False, '')``.
    """
    cnt = len(l)
    equals = {}
    for i, j in itertools.combinations(l, 2):
        if not i or not j:
            continue
        if i == j:
            key = i
        elif i.upper() != j.upper():
            continue
        elif getCapital(i) == j or getCapital(j) == i:
            # getCapital is defined elsewhere (presumably it upper-cases the
            # first letter - verify). This test has to run before the
            # upper() fallback: tokens that differ only in case always have
            # equal upper() forms, so the opposite order never reaches it.
            key = getCapital(i)
        else:
            key = i.upper()
        equals[key] = equals.get(key, 0) + 1
    if not equals:
        return False, ''
    # items() rather than iteritems() so this runs on Python 2 and 3 alike.
    best, votes = max(equals.items(), key=lambda x: x[1])
    # cnt * (cnt - 1) is always even, so floor division is exact.
    _comb = cnt * (cnt - 1) // 2
    if votes > 0.3 * _comb or votes == _comb:
        return True, best
    return False, ''

getCapital就是把首字母变成大写的函数。这是考虑到有些魂淡一会首字母大写一会首字母不大写造成的。不直接全部转小写再比较，是因为要尽量保持文件名的原始性，比如有些就是小写字母开头的名字，那不就坑爹了。

这里设置的阈值是30%的排列项一样就认为这个单位是共同部分。你觉得很低嘛，其实不低的呀，你想要是逗比字幕组在里面放一个招人.srt那不是傻掉了。

当然也要过滤扩展名。

啊呀好麻烦我不写了你们看代码吧

def getCommon(ori_lst, splt = '.-_ ]', with_no_digit = False):
    """Guess the name stem shared by a list of file names.

    ori_lst: file names. On the first pass (``with_no_digit`` false), empty
        entries and names ending in .txt/.jpg/.gif (any letter case) are
        dropped and the last dot-suffix is stripped from the rest.
    splt: candidate separator characters. The one that cuts the names into
        the largest total number of pieces is used.
    with_no_digit: set by the function itself for its second pass. Every run
        of digits is removed from the names, which are expected to have
        their extension stripped already.

    Returns the only remaining name when there is exactly one. Otherwise
    returns the leading run of tokens that enough names agree on, re-joined
    with the chosen separator, plus a closing ']' when that separator is
    ']'. Returns '' when nothing is shared, even after the digit-free retry.
    """
    if with_no_digit:
        # Second pass: the extension is already gone, only drop the digits.
        lst = [re.sub(r'\d+', '', x) for x in ori_lst]
    else:
        lst = ['.'.join(x.split('.')[:-1]) for x in ori_lst
               if x and x[-4:].lower() not in ('.txt', '.jpg', '.gif')]
    if not lst:
        return ''
    if len(lst) == 1:
        return lst[0]
    # Judge which splitter gets the most pieces.
    m_splt = max(splt, key=lambda x: sum(len(l.split(x)) for l in lst))

    def getEqual(l):
        """Return (True, token) if enough pairs in column ``l`` agree."""
        cnt = len(l)
        equals = {}
        for i, j in itertools.combinations(l, 2):
            if not i or not j:
                continue
            if i == j:
                key = i
            elif i.upper() != j.upper():
                continue
            elif getCapital(i) == j or getCapital(j) == i:
                # getCapital is defined elsewhere (presumably it upper-cases
                # the first letter - verify). It must be tried before the
                # upper() fallback, otherwise this branch is unreachable.
                key = getCapital(i)
            else:
                key = i.upper()
            equals[key] = equals.get(key, 0) + 1
        if not equals:
            return False, ''
        # items() rather than iteritems(): runs on Python 2 and 3 alike.
        best, votes = max(equals.items(), key=lambda x: x[1])
        _comb = cnt * (cnt - 1) // 2
        # A token wins with more than 30% of all pairs, or with every pair.
        if votes > 0.3 * _comb or votes == _comb:
            return True, best
        return False, ''

    m_lst = [l.split(m_splt) for l in lst]
    width = max(len(parts) for parts in m_lst)
    m_pattern = []
    for k in range(width):
        # k-th token of every name; short names are padded with None.
        column = [parts[k] if k < len(parts) else None for parts in m_lst]
        suc, new_pattern = getEqual(column)
        if not suc:
            break
        m_pattern.append(new_pattern)
    ret = m_splt.join(m_pattern)
    if m_pattern and m_splt == ']':
        # Splitting on ']' ate the closing bracket of the last kept token.
        # Only restore it when something matched, so that an empty result
        # stays empty and still triggers the retry below.
        ret += ']'
    if not ret and not with_no_digit:
        # Try again without digits to get rid of "season" and "episode"
        # differences; pass the prepared lst instead of ori_lst.
        return getCommon(lst, splt, with_no_digit = True)
    return ret

def getCommon(ori_lst, splt = '.-_ ]', with_no_digit = False):
    """Guess the name stem shared by a list of file names.

    ori_lst: file names. On the first pass (``with_no_digit`` false), empty
        entries and names ending in .txt/.jpg/.gif (any letter case) are
        dropped and the last dot-suffix is stripped from the rest.
    splt: candidate separator characters. The one that cuts the names into
        the largest total number of pieces is used.
    with_no_digit: set by the function itself for its second pass. Every run
        of digits is removed from the names, which are expected to have
        their extension stripped already.

    Returns the only remaining name when there is exactly one. Otherwise
    returns the leading run of tokens that enough names agree on, re-joined
    with the chosen separator, plus a closing ']' when that separator is
    ']'. Returns '' when nothing is shared, even after the digit-free retry.
    """
    if with_no_digit:
        # Second pass: the extension is already gone, only drop the digits.
        lst = [re.sub(r'\d+', '', x) for x in ori_lst]
    else:
        lst = ['.'.join(x.split('.')[:-1]) for x in ori_lst
               if x and x[-4:].lower() not in ('.txt', '.jpg', '.gif')]
    if not lst:
        return ''
    if len(lst) == 1:
        return lst[0]
    # Judge which splitter gets the most pieces.
    m_splt = max(splt, key=lambda x: sum(len(l.split(x)) for l in lst))

    def getEqual(l):
        """Return (True, token) if enough pairs in column ``l`` agree."""
        cnt = len(l)
        equals = {}
        for i, j in itertools.combinations(l, 2):
            if not i or not j:
                continue
            if i == j:
                key = i
            elif i.upper() != j.upper():
                continue
            elif getCapital(i) == j or getCapital(j) == i:
                # getCapital is defined elsewhere (presumably it upper-cases
                # the first letter - verify). It must be tried before the
                # upper() fallback, otherwise this branch is unreachable.
                key = getCapital(i)
            else:
                key = i.upper()
            equals[key] = equals.get(key, 0) + 1
        if not equals:
            return False, ''
        # items() rather than iteritems(): runs on Python 2 and 3 alike.
        best, votes = max(equals.items(), key=lambda x: x[1])
        _comb = cnt * (cnt - 1) // 2
        # A token wins with more than 30% of all pairs, or with every pair.
        if votes > 0.3 * _comb or votes == _comb:
            return True, best
        return False, ''

    m_lst = [l.split(m_splt) for l in lst]
    width = max(len(parts) for parts in m_lst)
    m_pattern = []
    for k in range(width):
        # k-th token of every name; short names are padded with None.
        column = [parts[k] if k < len(parts) else None for parts in m_lst]
        suc, new_pattern = getEqual(column)
        if not suc:
            break
        m_pattern.append(new_pattern)
    ret = m_splt.join(m_pattern)
    if m_pattern and m_splt == ']':
        # Splitting on ']' ate the closing bracket of the last kept token.
        # Only restore it when something matched, so that an empty result
        # stays empty and still triggers the retry below.
        ret += ']'
    if not ret and not with_no_digit:
        # Try again without digits to get rid of "season" and "episode"
        # differences; pass the prepared lst instead of ori_lst.
        return getCommon(lst, splt, with_no_digit = True)
    return ret