February 23, 2019
I wanted to create docs for a course I enrolled in, to help me with future reference. The course offers subtitles, and I didn't want to search through the subtitle files and copy-paste their content into the docs, so I decided to convert those subtitle files into text files and then copy the whole text into MkDocs (which is my reference documentation).
Code:
""" | |
Creates readable text file from SRT file. | |
""" | |
import re, sys | |
import os | |
def clean_up(lines):
    """
    Return only the caption text lines of an SRT subtitle file.

    An SRT cue consists of a sequence-number line, a timestamp line such as
    ``00:00:01,000 --> 00:00:04,000``, one or more caption lines and a blank
    separator line.  Sequence numbers, timestamps and blank lines are
    dropped; caption lines are returned unchanged and in their original
    order (they are NOT merged into paragraphs).

    Cues are recognised by their timestamp line rather than by counting
    sequence numbers, so the result is still correct when the file starts
    with a UTF-8 byte-order mark, when numbering does not start at 1 or has
    gaps, and when a caption itself is just a number.

    Args:
        lines: iterable of text lines as produced by ``file.readlines()``
            (line endings included).  The input is not modified.

    Returns:
        A new list holding the caption lines.
    """
    # Matches "HH:MM:SS,mmm --> HH:MM:SS,mmm".  A '.' millisecond separator
    # is accepted too, and anything after the end time is ignored.
    timestamp = re.compile(
        r'\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}\s*-->\s*\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}'
    )

    lines = list(lines)  # copy: we need look-ahead and must not mutate the input
    if lines and lines[0].startswith('\ufeff'):
        # A BOM glued to the first sequence number would hide it from isdigit().
        lines[0] = lines[0][1:]

    stripped = [line.strip() for line in lines]
    new_lines = []
    for position, (line, text) in enumerate(zip(lines, stripped)):
        # Blank (or whitespace-only) separators and timestamp lines carry no text.
        if not text or timestamp.match(text):
            continue
        # A numeric line is a sequence number only when a timestamp follows
        # it; otherwise it is genuine caption text (e.g. the answer "42").
        following = stripped[position + 1] if position + 1 < len(stripped) else ''
        if text.isdigit() and timestamp.match(following):
            continue
        new_lines.append(line)
    return new_lines
def main(args):
    """
    Walk a folder of subtitle files and write a cleaned copy of each file.

    For every directory found under the source root, a directory with the
    same base name is created directly beneath the destination root (the
    tree is flattened to one level, as before).  Each file in it is read,
    passed through clean_up(), and written to that directory under its
    original file name.  The file name and the number of lines read are
    printed as progress output.

    Args:
        args: command-line argument list, normally ``sys.argv``.
            ``args[1]``, when present, overrides the source root and
            ``args[2]`` the destination root.  Without them the original
            hard-coded locations are used, so running the script with no
            arguments behaves as it always did.
    """
    source_root = '/Users/vinkrish/Documents/Intro to Statistics Subtitles'
    target_root = '/Users/vinkrish/Documents/extracted/'
    if args and len(args) > 1:
        source_root = args[1]
    if args and len(args) > 2:
        target_root = args[2]

    for dirpath, _dirnames, filenames in os.walk(source_root):
        # normpath guards against a trailing slash yielding an empty base name.
        new_dir = os.path.join(target_root, os.path.basename(os.path.normpath(dirpath)))
        try:
            # makedirs also creates the destination root itself on first run.
            os.makedirs(new_dir, exist_ok=True)
        except OSError:
            print("Creation of the directory failed")
            # Nothing can be written here; move on instead of crashing on open().
            continue

        for file_name in filenames:
            # Finder metadata is binary, not a subtitle file.  The prefix test
            # covers both '.DS_Store' and a bulk-renamed '.DS_Store.txt'.
            if file_name.startswith('.DS_Store'):
                continue
            print(file_name)
            # utf-8-sig decodes UTF-8 and silently drops a leading BOM.
            with open(os.path.join(dirpath, file_name), encoding='utf-8-sig') as f:
                lines = f.readlines()
            print(len(lines))
            new_lines = clean_up(lines)
            with open(os.path.join(new_dir, file_name), 'w', encoding='utf-8') as new_file:
                new_file.writelines(new_lines)
# Script entry point: when executed directly (not imported), hand the raw
# command-line argument list to main().
if __name__ == "__main__":
    main(sys.argv)
""" | |
NOTES | |
* Run from command line as | |
** python subtitle-extract.py | |
* Creates file_name.txt with extracted text from file_name.srt | |
""" |