276 lines
12 KiB
Python
276 lines
12 KiB
Python
|
import json
|
||
|
import time
|
||
|
|
||
|
from selenium import webdriver
|
||
|
from selenium.webdriver.chrome.options import Options
|
||
|
|
||
|
|
||
|
"""
|
||
|
format of tree is
|
||
|
dict {
|
||
|
tree_id: tree_id_text
|
||
|
context: context text?
|
||
|
first_story_block
|
||
|
action_results: [act_res1, act_res2, act_res3...]
|
||
|
}
|
||
|
|
||
|
where each action_result's format is:
|
||
|
dict{
|
||
|
action: action_text
|
||
|
result: result_text
|
||
|
action_results: [act_res1, act_res2, act_res3...]
|
||
|
}
|
||
|
"""
|
||
|
|
||
|
|
||
|
class Scraper:
    """Selenium-driven crawler that records a branching story as a nested dict.

    The crawler loads a story page, reads the story text from one ``<div>``,
    treats every ``<a>`` element after the first ``NAV_LINK_COUNT`` as a
    selectable action, and explores the actions depth-first.  It clicks an
    action, records the resulting text, recurses, and then clicks the first
    link on the page to step back.

    Attributes:
        driver: the Selenium Chrome driver used for all page access.
        max_depth: stored in ``__init__`` but not consulted anywhere in this
            class; the recursion is bounded only by repeated texts and
            ``end_actions``.
        end_actions: link texts that are never followed.
        texts: result texts already recorded for the current story; reset by
            ``GoToURL``.
    """

    # The first four <a> elements are skipped when collecting actions;
    # link 0 is the one GoBack clicks.
    NAV_LINK_COUNT = 4
    # Index of the <div> whose text is taken as the story text.
    STORY_DIV_INDEX = 3
    # Seconds to wait after loading a URL / after clicking a link.
    PAGE_LOAD_DELAY = 0.5
    CLICK_DELAY = 0.2

    def __init__(self):
        """Create the Chrome driver and initialise the crawl state."""
        chrome_options = Options()
        chrome_options.add_argument("--binary=/path/to/other/chrome/binary")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--window-size=1920x1080")
        exec_path = "/usr/bin/chromedriver"
        # NOTE(review): the `chrome_options=` / `executable_path=` keywords and
        # the `find_elements_by_css_selector` calls below are the pre-Selenium-4
        # spellings -- confirm against the Selenium version this script runs on.
        self.driver = webdriver.Chrome(
            chrome_options=chrome_options, executable_path=exec_path
        )
        self.max_depth = 10
        self.end_actions = {
            "End Game and Leave Comments",
            "Click here to End the Game and Leave Comments",
            "See How Well You Did (you can still back-page afterwards if you like)",
            "You have died.",
            "You have died",
            "Epilogue",
            "Save Game",
            "Your quest might have been more successful...",
            "5 - not the best, certainly not the worst",
            "The End! (leave comments on game)",
            "6 - it's worth every cent",
            "You do not survive the journey to California",
            "Quit the game.",
            "7 - even better than Reeses' Cups®",
            "8 - it will bring you enlightenment",
            "End of game! Leave a comment!",
            "Better luck next time",
            "click here to continue",
            "Rating And Leaving Comments",
            "You do not survive your journey to California",
            "Your Outlaw Career has come to an end",
            "Thank you for taking the time to read my story",
            "You have no further part in the story, End Game and Leave Comments",
            "",
            "You play no further part in this story. End Game and Leave Comments",
            "drivers",
            "Alas, poor Yorick, they slew you well",
            "My heart bleeds for you",
            "To End the Game and Leave Comments click here",
            "Call it a day",
            "Check the voicemail.",
            "reset",
            "There's nothing you can do anymore...it's over.",
            "To Be Continued...",
            "Thanks again for taking the time to read this",
            "If you just want to escape this endless story you can do that by clicking here",
            "Boo Hoo Hoo",
            "End.",
            "Pick up some money real quick",
            "Well you did live a decent amount of time in the Army",
            "End Game",
            "You have survived the Donner Party's journey to California!",
        }
        self.texts = set()

    def GoToURL(self, url):
        """Load ``url`` and forget the texts recorded for any previous story."""
        self.texts = set()
        self.driver.get(url)
        time.sleep(self.PAGE_LOAD_DELAY)

    def GetText(self):
        """Return the text of the story ``<div>`` on the current page.

        Raises:
            IndexError: if the page has fewer than ``STORY_DIV_INDEX + 1``
                ``<div>`` elements.
        """
        div_elements = self.driver.find_elements_by_css_selector("div")
        return div_elements[self.STORY_DIV_INDEX].text

    def GetLinks(self):
        """Return every ``<a>`` element on the current page, in page order."""
        return self.driver.find_elements_by_css_selector("a")

    def GoBack(self):
        """Click the first link on the page, which the crawl uses to step back."""
        self.GetLinks()[0].click()
        time.sleep(self.CLICK_DELAY)

    def ClickAction(self, links, action_num):
        """Click the ``action_num``-th action among ``links``.

        Args:
            links: the full link list as returned by ``GetLinks``.
            action_num: zero-based index into the action links, i.e. the
                links after the first ``NAV_LINK_COUNT``.
        """
        links[action_num + self.NAV_LINK_COUNT].click()
        time.sleep(self.CLICK_DELAY)

    def GetActions(self):
        """Return the texts of the action links on the current page."""
        return [link.text for link in self.GetLinks()[self.NAV_LINK_COUNT:]]

    def NumActions(self):
        """Return the link count minus ``NAV_LINK_COUNT``."""
        return len(self.GetLinks()) - self.NAV_LINK_COUNT

    def BuildTreeHelper(self, parent_story, action_num, depth, old_actions):
        """Follow one action and recursively record everything reachable from it.

        Args:
            parent_story: story text of the page the action is taken from.
            action_num: index of the action in ``old_actions``.
            depth: depth of the parent page; incremented and passed down.
            old_actions: action texts of the parent page.

        Returns:
            A dict with keys ``action``, ``result`` and ``action_results``, or
            ``None`` when the action link is missing, the resulting text equals
            ``parent_story``, or the text was already recorded.  Unless the
            link was missing, the browser is stepped back to the parent page
            before returning.
        """
        # Passed on to recursive calls only; self.max_depth is not checked here.
        depth += 1

        action = old_actions[action_num]
        print("Action is ", repr(action))

        links = self.GetLinks()
        if action_num + self.NAV_LINK_COUNT >= len(links):
            # Nothing was clicked, so there is nothing to step back from.
            return None

        self.ClickAction(links, action_num)
        result = self.GetText()
        if result == parent_story or result in self.texts:
            self.GoBack()
            return None

        self.texts.add(result)
        print(len(self.texts))

        action_result = {"action": action, "result": result, "action_results": []}

        actions = self.GetActions()
        for i, next_action in enumerate(actions):
            if next_action in self.end_actions:
                continue
            sub_action_result = self.BuildTreeHelper(result, i, depth, actions)
            # Keep only branches that produced a result, as BuildStoryTree does.
            if sub_action_result is not None:
                action_result["action_results"].append(sub_action_result)

        self.GoBack()
        return action_result

    def BuildStoryTree(self, url):
        """Crawl the story at ``url`` and return it as a tree.

        Returns:
            A dict with keys ``tree_id`` (the URL), ``context`` (always ``""``),
            ``first_story_block`` (text of the first page) and
            ``action_results`` (one subtree per followed action).
        """
        self.GoToURL(url)
        text = self.GetText()
        actions = self.GetActions()
        story_dict = {
            "tree_id": url,
            "context": "",
            "first_story_block": text,
            "action_results": [],
        }

        for i, action in enumerate(actions):
            if action not in self.end_actions:
                action_result = self.BuildTreeHelper(text, i, 0, actions)
                if action_result is not None:
                    story_dict["action_results"].append(action_result)
            else:
                print("done")

        return story_dict
|
||
|
|
||
|
|
||
|
def save_tree(tree, filename):
    """Write ``tree`` to ``filename`` as JSON.

    The parent directory of ``filename`` is created first if it does not
    exist, so a finished scrape is not lost to a missing output folder.

    Args:
        tree: a JSON-serializable object (the dict built by the scraper).
        filename: path of the file to write; an existing file is overwritten.
    """
    import os  # local import so the file's top-level import block is untouched

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when filename has no directory component
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w") as fp:
        json.dump(tree, fp)
|
||
|
|
||
|
|
||
|
# Shared module-level Scraper instance driven by the extraction loop at the
# bottom of the file; constructing it creates the Selenium Chrome driver.
scraper = Scraper()
|
||
|
|
||
|
# Story pages to scrape, in processing order.  Every entry is the viewer URL
# prefix followed by a StoryId; some StoryIds (e.g. 8041) appear more than
# once, and the order matters because output files are numbered by position.
urls = [
    "http://chooseyourstory.com/story/viewer/default.aspx?StoryId=" + str(story_id)
    for story_id in (
        10638, 11246, 54639, 7397, 8041, 11545, 7393, 13875, 37696, 31013,
        45375, 41698, 10634, 42204, 6823, 18988, 10359, 5466, 28030, 56515,
        7480, 11274, 53134, 17306, 470, 8041, 23928, 10183, 45866, 60232,
        6376, 36791, 60128, 52961, 54011, 34838, 13349, 8038, 56742, 48393,
        53356, 10872, 7393, 31013, 43910, 53837, 8098, 55043, 28838, 11906,
        8040, 2280, 31014, 43744, 44543, 56753, 36594, 15424, 8035, 10524,
        14899, 9361, 28030, 49642, 43573, 38025, 7480, 7567, 60747, 10359,
        31353, 13875, 56501, 38542, 42204, 43993, 1153, 24743, 57114, 52887,
        21879, 16489, 53186, 34849, 26752, 7094, 8557, 45225, 4720, 51926,
        45375, 27234, 60772,
    )
]
|
||
|
|
||
|
# Scrape the stories from list position 50 onward (earlier entries are
# skipped).  Output files are numbered 41 + position, so position 50 is
# written to stories/story91.json.
for story_index, story_url in enumerate(urls[50:], start=50):
    print("****** Extracting Adventure ", story_url, " ***********")
    tree = scraper.BuildStoryTree(story_url)
    output_path = "stories/story" + str(41 + story_index) + ".json"
    save_tree(tree, output_path)

print("done")
|