Update sweetandshort scraper, some more corrections to prompt scraper
This commit is contained in:
parent
31f4a80ddc
commit
4915e8db4b
1 changed file with 149 additions and 109 deletions
258
promptscrape.py
258
promptscrape.py
|
@ -84,6 +84,8 @@ try:
|
|||
adstrippable = str(adtheprompt.text)
|
||||
while adstrippable[-1] == " ":
|
||||
adstrippable = adstrippable[:-1]
|
||||
while adstrippable[0] == " ":
|
||||
adstrippable = adstrippable[1:]
|
||||
print("anythingdrabble (100, 200, 300, 400, or 500 words): \033[1m" + adstrippable.lower() + "\033[0m (" + adprompt + ")\n")
|
||||
thefile.write("- [[" + adprompt + "][anythingdrabble]] (100, 200, 300, 400, or 500 words): *" + adstrippable.lower() + "*\n")
|
||||
except:
|
||||
|
@ -124,6 +126,7 @@ try:
|
|||
zonetheprompt = zoneprompttext.find("strong")
|
||||
print("drabble-zone (100 or 200 words): \033[1m" + zonetheprompt.text.lower() + "\033[0m (" + zoneprompt + ")\n")
|
||||
thefile.write("- [[" + zoneprompt + "][drabble-zone]] (100 or 200 words): *" + zonetheprompt.text.lower() + "*\n")
|
||||
|
||||
emotion = "https://emotion100.dreamwidth.org/tag/*modpost?style=light&tag=%2Amodpost"
|
||||
emotionpage = s.get(emotion)
|
||||
emotionsoup = BeautifulSoup(emotionpage.content, "html.parser")
|
||||
|
@ -146,7 +149,7 @@ try:
|
|||
ffa = "https://fail-fandomanon.dreamwidth.org/?style=light"
|
||||
ffapage = requests.get(ffa)
|
||||
ffasoup = BeautifulSoup(ffapage.content, "html.parser")
|
||||
ffaprompts = ffasoup.find_all("h3", string=lambda text: "ffa dw post" in text.lower())
|
||||
ffaprompts = ffasoup.find_all("h3")
|
||||
ffapromptstrim = [x for x in ffaprompts if "Placeholder" not in str(x)]
|
||||
ffasubsoup = BeautifulSoup(str(ffapromptstrim[0]), "html.parser")
|
||||
ffaurl = ffasubsoup.find("a")
|
||||
|
@ -210,7 +213,7 @@ try:
|
|||
flash = "https://fan-flashworks.dreamwidth.org/?style=light&tag=admin"
|
||||
flashpage = requests.get(flash)
|
||||
flashsoup = BeautifulSoup(flashpage.content, "html.parser")
|
||||
flashprompts = flashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
|
||||
flashprompts = flashsoup.find_all(lambda tag: tag.name == "h3" and "Challenge" in tag.text)
|
||||
flashsubsoup = BeautifulSoup(str(flashprompts[0]), "html.parser")
|
||||
flashurl = flashsubsoup.find("a")
|
||||
flashprompt = (flashurl["href"])
|
||||
|
@ -224,24 +227,25 @@ try:
|
|||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges"
|
||||
femslashpage = requests.get(femslash)
|
||||
femslashsoup = BeautifulSoup(femslashpage.content, "html.parser")
|
||||
femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
|
||||
femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser")
|
||||
femslashurl = femslashsubsoup.find("a")
|
||||
femslashprompt = (femslashurl["href"])
|
||||
femslashpromptnew = (femslashurl["href"] + "?style=light")
|
||||
femslashpromptpage = requests.get(femslashpromptnew)
|
||||
femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser")
|
||||
femslashprompttext = femslashpromptsoup.find(class_="entry-content")
|
||||
femslashtheprompt = femslashprompttext.find("i")
|
||||
if femslashtheprompt is not None:
|
||||
print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n")
|
||||
thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n")
|
||||
except:
|
||||
pass
|
||||
# seems dead
|
||||
# try:
|
||||
# femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges"
|
||||
# femslashpage = requests.get(femslash)
|
||||
# femslashsoup = BeautifulSoup(femslashpage.content, "html.parser")
|
||||
# femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
|
||||
# femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser")
|
||||
# femslashurl = femslashsubsoup.find("a")
|
||||
# femslashprompt = (femslashurl["href"])
|
||||
# femslashpromptnew = (femslashurl["href"] + "?style=light")
|
||||
# femslashpromptpage = requests.get(femslashpromptnew)
|
||||
# femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser")
|
||||
# femslashprompttext = femslashpromptsoup.find(class_="entry-content")
|
||||
# femslashtheprompt = femslashprompttext.find("i")
|
||||
# if femslashtheprompt is not None:
|
||||
# print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n")
|
||||
# thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n")
|
||||
# except:
|
||||
# pass
|
||||
|
||||
try:
|
||||
with requests.Session() as s:
|
||||
|
@ -339,98 +343,134 @@ try:
|
|||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
if 30 > today > 21:
|
||||
ssbingo = "https://sweetandshort.dreamwidth.org/tag/challenge:+bingo?style=light&tag=challenge:+bingo"
|
||||
ssbingopage = requests.get(ssbingo)
|
||||
ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser")
|
||||
ssbingoprompts = ssbingosoup.find_all("h3")
|
||||
ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser")
|
||||
ssbingourl = ssbingosubsoup.find("a")
|
||||
ssbingoprompt = (ssbingourl["href"])
|
||||
ssbingopromptnew = (ssbingourl["href"] + "?style=light")
|
||||
ssbingopromptpage = requests.get(ssbingopromptnew)
|
||||
ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser")
|
||||
ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content")
|
||||
ssbingotheprompt = ssbingoprompttext.find_all("td")
|
||||
ssbingoclean = []
|
||||
for prompt in ssbingotheprompt:
|
||||
newprompt = re.sub("<.*?>","",str(prompt))
|
||||
ssbingoclean.append(newprompt)
|
||||
ssbingofinal = "; ".join(ssbingoclean).lower()
|
||||
print("sweet and short bingo (up to 300 words for two prompts, up to 600 words for four prompts): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n")
|
||||
thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 300 words for two prompts, up to 600 words for four prompts): *" + ssbingofinal + "*\n")
|
||||
except:
|
||||
pass
|
||||
# sweet and short: complex and time-dependent rules …
|
||||
# first need to work out which of the two alternating monthly challenges we're on
|
||||
|
||||
try:
|
||||
if 16 > today > 7:
|
||||
ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky"
|
||||
ssquickypage = requests.get(ssquicky)
|
||||
ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser")
|
||||
ssquickyprompts = ssquickysoup.find_all("h3")
|
||||
ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser")
|
||||
ssquickyurl = ssquickysubsoup.find("a")
|
||||
ssquickyprompt = (ssquickyurl["href"])
|
||||
# deliberately not using style=light here so we can get at the comment contents
|
||||
ssquickypromptnew = (ssquickyurl["href"])
|
||||
ssquickypromptpage = requests.get(ssquickypromptnew)
|
||||
ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser")
|
||||
promptcatch = ".*New Prompts Here"
|
||||
# ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch))
|
||||
ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment")
|
||||
ssquickycomments = []
|
||||
for comment in ssquickytheprompt:
|
||||
if re.search("New Prompts Here",str(comment)):
|
||||
commenttext = re.findall(r"<div class=\"comment-content\".*?</div>",str(comment))
|
||||
commentprompt = re.sub("<.*?>","",str(commenttext))
|
||||
ssquickycomments.append(str(commentprompt)[2:-2])
|
||||
ssquickycprompt = "; ".join(ssquickycomments)
|
||||
print("sweet and short comment quicky (up to 99 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n")
|
||||
thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 99 words): *" + ssquickycprompt.lower() + "*\n")
|
||||
except:
|
||||
pass
|
||||
def ssalternatefor(year, month):
    """Return which of the two alternating challenges applies to a month.

    year:  calendar year as an int.
    month: calendar month as an int (1-12).

    Returns "comment" or "picture". In odd years the months 1, 3, 6, 9 and
    11 map to "comment" and every other month to "picture"; in even years
    the mapping is reversed.
    """
    firstgroup = month in (1, 3, 6, 9, 11)
    # Parity test: the previous `year // 2` was truthy for every real year,
    # which left the reversed (even-year) mapping unreachable.
    if year % 2:
        return "comment" if firstgroup else "picture"
    return "picture" if firstgroup else "comment"


# Read the date once so month and year always belong to the same day.
currentdate = date.today()
themonth = currentdate.month
thisyear = currentdate.year
alternate = ssalternatefor(thisyear, themonth)
|
||||
|
||||
try:
|
||||
ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20"
|
||||
ssmonthlypage = requests.get(ssmonthly)
|
||||
ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser")
|
||||
ssmonthlyprompts = ssmonthlysoup.find_all("h3")
|
||||
ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser")
|
||||
ssmonthlyurl = ssmonthlysubsoup.find("a")
|
||||
ssmonthlyprompt = (ssmonthlyurl["href"])
|
||||
ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light")
|
||||
ssmonthlypromptpage = requests.get(ssmonthlypromptnew)
|
||||
ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser")
|
||||
ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content")
|
||||
ssmonthlypromptmedian = re.findall(r"<a name=\"cutid1\">.*", str(ssmonthlyprompttext))
|
||||
ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian))
|
||||
ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone))
|
||||
ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo))
|
||||
ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2]
|
||||
print("sweet and short monthly prompts (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n")
|
||||
thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): *" + ssmonthlypromptfinal + "*\n")
|
||||
except:
|
||||
pass
|
||||
if themonth != 4 and themonth != 8 and themonth != 12:
|
||||
try:
|
||||
if today > 21:
|
||||
ssbingo = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+bingo?style=light&tag=!new+challenge,challenge:+bingo&mode=and"
|
||||
ssbingopage = requests.get(ssbingo)
|
||||
ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser")
|
||||
ssbingoprompts = ssbingosoup.find_all("h3")
|
||||
ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser")
|
||||
ssbingourl = ssbingosubsoup.find("a")
|
||||
ssbingoprompt = (ssbingourl["href"])
|
||||
ssbingopromptnew = (ssbingourl["href"] + "?style=light")
|
||||
ssbingopromptpage = requests.get(ssbingopromptnew)
|
||||
ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser")
|
||||
ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content")
|
||||
ssbingotheprompt = ssbingoprompttext.find_all("td")
|
||||
ssbingoclean = []
|
||||
for prompt in ssbingotheprompt:
|
||||
newprompt = re.sub("<.*?>","",str(prompt))
|
||||
ssbingoclean.append(newprompt)
|
||||
ssbingofinal = "; ".join(ssbingoclean).lower()
|
||||
print("sweet and short bingo (up to 500 words, separate or combined): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n")
|
||||
thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 500 words, separate or combined): *" + ssbingofinal + "*\n")
|
||||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
if today > 14:
|
||||
sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun"
|
||||
sspicturepage = requests.get(sspicture)
|
||||
sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser")
|
||||
monthstring = ".*" + month + ".*"
|
||||
sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring))
|
||||
sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser")
|
||||
sspictureurl = sspicturesubsoup.find("a")
|
||||
sspictureprompt = (sspictureurl["href"])
|
||||
sspicturepromptnew = (sspictureurl["href"] + "?style=light")
|
||||
sspicturepromptpage = requests.get(sspicturepromptnew)
|
||||
sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser")
|
||||
sspictureprompttext = sspicturepromptsoup.find("h3")
|
||||
print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n")
|
||||
thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n")
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
if today > 7:
|
||||
if alternate == "comment":
|
||||
ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky"
|
||||
ssquickypage = requests.get(ssquicky)
|
||||
ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser")
|
||||
ssquickyprompts = ssquickysoup.find_all("h3")
|
||||
ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser")
|
||||
ssquickyurl = ssquickysubsoup.find("a")
|
||||
ssquickyprompt = (ssquickyurl["href"])
|
||||
# deliberately not using style=light here so we can get at the comment contents
|
||||
ssquickypromptnew = (ssquickyurl["href"])
|
||||
ssquickypromptpage = requests.get(ssquickypromptnew)
|
||||
ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser")
|
||||
promptcatch = ".*New Prompts Here"
|
||||
# ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch))
|
||||
ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment")
|
||||
ssquickycomments = []
|
||||
for comment in ssquickytheprompt:
|
||||
if re.search("New Prompts Here",str(comment)):
|
||||
commenttext = re.findall(r"<div class=\"comment-content\".*?</div>",str(comment))
|
||||
commentprompt = re.sub("<.*?>","",str(commenttext))
|
||||
ssquickycomments.append(str(commentprompt)[2:-2])
|
||||
ssquickycprompt = "; ".join(ssquickycomments)
|
||||
print("sweet and short comment quicky (up to 100 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n")
|
||||
thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 100 words): *" + ssquickycprompt.lower() + "*\n")
|
||||
elif alternate == "picture":
|
||||
sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun&mode=and"
|
||||
sspicturepage = requests.get(sspicture)
|
||||
sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser")
|
||||
monthstring = ".*" + month + ".*"
|
||||
sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring))
|
||||
sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser")
|
||||
sspictureurl = sspicturesubsoup.find("a")
|
||||
sspictureprompt = (sspictureurl["href"])
|
||||
sspicturepromptnew = (sspictureurl["href"] + "?style=light")
|
||||
sspicturepromptpage = requests.get(sspicturepromptnew)
|
||||
sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser")
|
||||
sspictureprompttext = sspicturepromptsoup.find("h3")
|
||||
print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n")
|
||||
thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n")
|
||||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20&mode=and"
|
||||
ssmonthlypage = requests.get(ssmonthly)
|
||||
ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser")
|
||||
ssmonthlyprompts = ssmonthlysoup.find_all("h3")
|
||||
ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser")
|
||||
ssmonthlyurl = ssmonthlysubsoup.find("a")
|
||||
ssmonthlyprompt = (ssmonthlyurl["href"])
|
||||
ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light")
|
||||
ssmonthlypromptpage = requests.get(ssmonthlypromptnew)
|
||||
ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser")
|
||||
ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content")
|
||||
ssmonthlypromptmedian = re.findall(r"<a name=\"cutid1\">.*", str(ssmonthlyprompttext))
|
||||
ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian))
|
||||
ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone))
|
||||
ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo))
|
||||
ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2]
|
||||
print("sweet and short monthly prompts (up to 500 words based on at least 10 prompts): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n")
|
||||
thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 500 words based on at least 10 prompts): *" + ssmonthlypromptfinal + "*\n")
|
||||
except:
|
||||
pass
|
||||
|
||||
def ssonecleanprompt(markup):
    """Convert the numbered prompt markup into a single "first; second" line.

    markup: HTML text of the element that holds the prompt lines.

    Returns the text with all tags removed, every literal "1. " marker
    dropped and every literal "2. " marker replaced by "; ".
    """
    plain = re.sub(r"<.*?>", "", markup)
    # The dots are escaped so that only the list markers are touched; an
    # unescaped "." would also match text such as "12 " or "10 ".
    plain = re.sub(r"1\. ", "", plain)
    return re.sub(r"2\. ", "; ", plain)


# sweet and short "only one" challenge: only looked up after the 14th.
try:
    if today > 14:
        ssone = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+only+one?mode=and&style=light&tag=%21new+challenge,challenge:+only+one&mode=and"
        ssonepage = requests.get(ssone)
        ssonesoup = BeautifulSoup(ssonepage.content, "html.parser")
        # The first <h3> on the tag page is taken as the newest challenge post.
        ssoneprompts = ssonesoup.find_all("h3")
        ssonesubsoup = BeautifulSoup(str(ssoneprompts[0]), "html.parser")
        ssoneurl = ssonesubsoup.find("a")
        ssoneprompt = (ssoneurl["href"])
        ssonepromptnew = (ssoneurl["href"] + "?style=light")
        ssonepromptpage = requests.get(ssonepromptnew)
        ssonepromptsoup = BeautifulSoup(ssonepromptpage.content, "html.parser")
        ssoneprompttext = ssonepromptsoup.find("i")
        # find() returns None when the post has no <i> element; skip the entry
        # rather than reporting the literal text "None" as the prompt.
        if ssoneprompttext is not None:
            ssonepromptfinal = ssonecleanprompt(str(ssoneprompttext))
            print("sweet and short one sentence (up to 500 words, use one or both lines as the start and/or end): \033[1m" + ssonepromptfinal + "\033[0m (" + ssoneprompt + ")\n")
            thefile.write("- [[" + ssoneprompt + "][sweet and short one sentence]] (up to 500 words, use one or both lines as the start and/or end): *" + ssonepromptfinal + "*\n")
except Exception:
    # Best-effort scrape: any failure skips this community, but Ctrl-C and
    # interpreter exit are no longer swallowed as they were by a bare except.
    pass
|
||||
|
||||
try:
|
||||
vocab = "https://vocab-drabbles.dreamwidth.org/?style=light&tag=challenge"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue