This repository has been archived on 2025-03-12. You can view files and clone it, but cannot push or open issues or pull requests.
AIDungeon/data/make_reddit_data.py

59 lines
1.5 KiB
Python
Raw Permalink Normal View History

2025-03-11 22:26:45 -04:00
import json
import os
from story.utils import *
def load_stories(file):
    """Load story records from a JSON file or a JSON-lines file.

    The file is first parsed as one JSON document. If that parse fails, the
    file is read again line by line and every line longer than 10 characters
    is parsed as its own JSON document; shorter lines are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        The parsed JSON document when the whole file is valid JSON;
        otherwise a list holding one parsed value per qualifying line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a qualifying line in the fallback pass is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # JSON interchange text is UTF-8, so do not depend on the platform's
    # default encoding when decoding it.
    try:
        with open(file, encoding="utf-8") as fp:
            return json.load(fp)
    except ValueError:
        # Only parse/decode failures mean "not a single JSON document".
        # Anything else (missing file, Ctrl-C, ...) propagates instead of
        # being mistaken for a JSON-lines file.
        pass

    stories = []
    with open(file, encoding="utf-8") as fp:
        for line in fp:
            # Skip blank or near-empty lines (10 characters or fewer).
            if len(line) > 10:
                stories.append(json.loads(line))
    return stories
def modify_story(story):
    """Filter one story record and return its converted text, or None.

    Args:
        story: Mapping with a "body" entry holding the story text.

    Returns:
        None when the body is shorter than 100 characters, or when neither
        ``is_first_person`` nor ``is_second_person`` reports a match;
        otherwise the result of ``first_to_second_person`` on the body.
        (These helpers are presumably supplied by the ``story.utils`` star
        import -- confirm there for their exact semantics.)
    """
    body = story["body"]
    if len(body) < 100:
        return None

    # Both detectors are always evaluated, matching the original call order.
    uses_first = is_first_person(body)
    uses_second = is_second_person(body)
    if not (uses_first or uses_second):
        return None
    return first_to_second_person(body)
# Script body: run every file under ./writingprompts through load_stories and
# modify_story, then write the surviving stories, each wrapped in start/end
# tokens, to writing_prompts.txt in the current working directory.
current = os.getcwd()
files = os.listdir(current + "/writingprompts")
output_file_path = "writing_prompts.txt"
start_token = "<|startoftext|>"
end_token = "<|endoftext|>"

# The output file is opened (and truncated) before any input is processed.
with open(output_file_path, "w") as output_file:
    filenames = ["writingprompts/" + file for file in files]
    cleaned_stories = []
    for filename in filenames:
        print("Processing file ", filename)
        converted = (modify_story(entry) for entry in load_stories(filename))
        # Keep only the stories that modify_story did not reject.
        cleaned_stories.extend(text for text in converted if text is not None)

    # One token-delimited story per line, assembled in a single pass.
    raw_text = "".join(
        start_token + story + end_token + "\n" for story in cleaned_stories
    )
    print(len(raw_text))
    output_file.write(raw_text)