59 lines
1.5 KiB
Python
59 lines
1.5 KiB
Python
|
import json
|
||
|
import os
|
||
|
|
||
|
from story.utils import *
|
||
|
|
||
|
|
||
|
def load_stories(file):
    """Load stories from a JSON file or a JSON-lines file.

    The file is first parsed as one JSON document. If that fails to decode,
    the file is read again line by line: every line longer than 10
    characters is parsed as its own JSON value, and shorter lines (blank
    lines, stray brackets) are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded JSON document, or, in the line-by-line fallback, a list
        with one decoded value per accepted line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line accepted by the fallback is not
            valid JSON.
    """
    try:
        # JSON text is UTF-8, so do not depend on the platform's locale.
        with open(file, encoding="utf-8") as fp:
            return json.load(fp)
    except json.JSONDecodeError:
        # Not a single JSON document; fall through to the per-line parser.
        # Only decode errors are handled here so that unrelated failures
        # (missing file, Ctrl-C, ...) are not silently retried.
        pass

    with open(file, encoding="utf-8") as fp:
        return [json.loads(line) for line in fp if len(line) > 10]
|
||
|
|
||
|
|
||
|
def modify_story(story):
    """Return the converted body of a story, or None if it is rejected.

    A story is rejected when its ``"body"`` is shorter than 100 characters,
    or when neither ``is_first_person`` nor ``is_second_person`` reports a
    match for it. Otherwise the body is passed through
    ``first_to_second_person`` and that result is returned.

    The three helpers are not defined in this block; presumably they are
    provided by the ``from story.utils import *`` star import.

    Args:
        story: Mapping with a ``"body"`` entry holding the story text.

    Returns:
        The result of ``first_to_second_person(body)``, or None.
    """
    body = story["body"]

    # Too short to be worth keeping.
    if len(body) < 100:
        return None

    # Both checks are always evaluated before the decision is made.
    person_checks = (is_first_person(body), is_second_person(body))
    if not any(person_checks):
        return None

    return first_to_second_person(body)
|
||
|
|
||
|
|
||
|
# Build one training text file from every story file in ./writingprompts.
current = os.getcwd()
# os.listdir returns entries in arbitrary order; sort them so repeated runs
# produce the same output.
files = sorted(os.listdir(os.path.join(current, "writingprompts")))
output_file_path = "writing_prompts.txt"
filenames = ["writingprompts/" + file for file in files]

cleaned_stories = []
for filename in filenames:
    print("Processing file ", filename)
    for story in load_stories(filename):
        cleaned_story = modify_story(story)
        # modify_story returns None for stories that should be dropped.
        if cleaned_story is not None:
            cleaned_stories.append(cleaned_story)

# Each story is wrapped in start/end marker tokens, one story per line.
start_token = "<|startoftext|>"
end_token = "<|endoftext|>"
# A single join avoids the quadratic cost of repeated string concatenation.
raw_text = "".join(
    start_token + story + end_token + "\n" for story in cleaned_stories
)
print(len(raw_text))

# Open the output only once the text is ready, so a failure while reading
# the inputs does not leave behind a truncated or empty output file.
with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(raw_text)
|