Home

Extract Text from Subtitle

February 23, 2019

I wanted to create docs for a course I enrolled in, to keep for future reference. The course offers subtitles, and I didn’t want to search through the subtitle files and copy-paste content into the docs by hand, so I decided to convert those subtitle files into text files and then copy the whole text into MkDocs (which is my reference documentation).

Code:

"""
Creates readable text file from SRT file.
"""
import re, sys
import os
def clean_up(lines):
    """
    Strip SRT bookkeeping lines and keep only the subtitle text.

    An SRT cue is a counter line ("1", "2", ...), followed by a timestamp
    line, followed by one or more text lines and a blank separator. Counter
    lines, the line directly after each counter, and blank lines are
    dropped; every other line is returned unchanged (including its trailing
    newline), in the original order.

    :param lines: iterable of lines as returned by ``file.readlines()``
    :return: list of the remaining text lines
    """
    srt_count = 1
    new_lines = []
    escape_timestamp = False
    for line in lines:
        # A UTF-8 byte-order mark survives str.strip(); without removing it
        # the very first counter ("1") never matches and nothing is cleaned.
        stripped = line.strip().lstrip('\ufeff')
        # Counters are recognised by their expected sequence value, so a
        # text line that is only digits is kept unless it equals the next
        # expected counter.
        if stripped == str(srt_count):
            srt_count += 1
            escape_timestamp = True
            continue
        if escape_timestamp:
            # The line right after a counter is the timestamp range.
            escape_timestamp = False
            continue
        if not stripped:
            # Blank separator (also when it only holds spaces or '\r').
            continue
        new_lines.append(line)
    return new_lines
def main(args):
    """
    Walk the subtitle folder and write a cleaned copy of every file.

    For each directory found under the source folder, a directory with the
    same base name is created directly under the destination folder (nested
    source directories are flattened to their last path component), and each
    file is passed through ``clean_up`` and written there under its original
    file name.

    :param args: command-line arguments (``sys.argv`` style). ``args[1]``,
        if present, overrides the source folder and ``args[2]``, if present,
        overrides the destination folder; with no extra arguments the
        original hard-coded locations are used.
    """
    source_root = '/Users/vinkrish/Documents/Intro to Statistics Subtitles'
    target_root = '/Users/vinkrish/Documents/extracted'
    if len(args) > 1:
        source_root = args[1]
    if len(args) > 2:
        target_root = args[2]

    # Finder metadata is not a subtitle file; the '.txt' variant covers
    # folders whose files were bulk-renamed to '.txt' beforehand.
    skipped_names = ('.DS_Store', '.DS_Store.txt')

    for dirpath, _dirnames, filenames in os.walk(source_root):
        # normpath guards against a trailing separator yielding an empty name.
        tail_dir = os.path.basename(os.path.normpath(dirpath))
        new_dir = os.path.join(target_root, tail_dir)
        if not os.path.isdir(new_dir):
            try:
                # makedirs also creates the destination root when missing.
                os.makedirs(new_dir)
            except OSError:
                print("Creation of the directory %s failed" % new_dir)
                # Nothing can be written here; move on to the next folder
                # instead of crashing on the first open() below.
                continue
        for file_name in filenames:
            if file_name in skipped_names:
                continue
            print(file_name)
            with open(os.path.join(dirpath, file_name)) as f:
                lines = f.readlines()
            print(len(lines))
            new_lines = clean_up(lines)
            new_file_name = os.path.join(new_dir, file_name)
            with open(new_file_name, 'w') as new_file:
                new_file.writelines(new_lines)
# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)
"""
NOTES
* Run from command line as
** python subtitle-extract.py
* For each subtitle file, writes the extracted text to a file with the same name inside the extracted folder
"""


Hi, I'm Vinay - a Programmer who "Read-Evaluate-Loop". You can find me on GitHub LinkedIn Twitter


© 2024, Built with Gatsby