58 lines
1.5 KiB
Python
58 lines
1.5 KiB
Python
import json
|
|
import os
|
|
|
|
from story.utils import *
|
|
|
|
|
|
def load_stories(file):
    """Load stories from a JSON file or a line-delimited JSON file.

    The file is first parsed as a single JSON document. If that fails, it
    is read again line by line: every line longer than 10 characters is
    parsed as its own JSON document and shorter lines are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded JSON document, or, for line-delimited input, a list
        holding one decoded value per parsed line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line longer than 10 characters is not
            valid JSON.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file, encoding="utf-8") as fp:
            return json.load(fp)
    except json.JSONDecodeError:
        # Not a single JSON document: fall back to one document per line.
        # Only parse errors trigger the fallback; I/O errors and interrupts
        # propagate instead of being swallowed.
        pass

    with open(file, encoding="utf-8") as fp:
        # Lines of 10 characters or fewer (e.g. blank lines) are skipped.
        return [json.loads(line) for line in fp if len(line) > 10]
|
|
|
|
|
|
def modify_story(story):
    """Return the second-person rewrite of a story, or None if it is skipped.

    A story is skipped when its "body" text is shorter than 100 characters,
    or when neither is_first_person() nor is_second_person() reports a match
    for it. Otherwise the result of first_to_second_person() is returned.
    """
    body = story["body"]

    # Very short bodies are dropped before any further checks run.
    if len(body) < 100:
        return None

    # Both checks are always evaluated, in this order.
    uses_first = is_first_person(body)
    uses_second = is_second_person(body)
    if not (uses_first or uses_second):
        return None

    return first_to_second_person(body)
|
|
|
|
|
|
# Build "writing_prompts.txt" from every file in the "writingprompts"
# directory of the current working directory.
current = os.getcwd()
files = os.listdir(os.path.join(current, "writingprompts"))
output_file_path = "writing_prompts.txt"

# Relative paths resolve against the working directory that was just listed.
filenames = [os.path.join("writingprompts", file) for file in files]

# Keep only the stories that modify_story() does not reject (returns None for).
cleaned_stories = []
for filename in filenames:
    print("Processing file ", filename)
    stories = load_stories(filename)
    for story in stories:
        cleaned_story = modify_story(story)
        if cleaned_story is not None:
            cleaned_stories.append(cleaned_story)

# Each story is wrapped in start/end marker tokens and ends with a newline.
start_token = "<|startoftext|>"
end_token = "<|endoftext|>"
# A single join avoids rebuilding the growing string once per story.
raw_text = "".join(
    start_token + story + end_token + "\n" for story in cleaned_stories
)
# NOTE(review): the total length is reported once, after assembly; confirm
# this matches the intended placement of the print.
print(len(raw_text))

# The output is opened only after all input has been processed, so a failure
# while loading or converting stories does not truncate an existing file.
with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(raw_text)
|