From 4915e8db4b35e8b85b13dc93525fb8cbd4d43b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?tr=C3=A9meur?= Date: Sun, 4 Feb 2024 13:08:51 +0000 Subject: [PATCH] Update sweetandshort scraper, some more corrections to prompt scraper --- promptscrape.py | 258 ++++++++++++++++++++++++++++-------------------- 1 file changed, 149 insertions(+), 109 deletions(-) diff --git a/promptscrape.py b/promptscrape.py index 078a23b..b4000d6 100644 --- a/promptscrape.py +++ b/promptscrape.py @@ -84,6 +84,8 @@ try: adstrippable = str(adtheprompt.text) while adstrippable[-1] == " ": adstrippable = adstrippable[:-1] + while adstrippable[0] == " ": + adstrippable = adstrippable[1:] print("anythingdrabble (100, 200, 300, 400, or 500 words): \033[1m" + adstrippable.lower() + "\033[0m (" + adprompt + ")\n") thefile.write("- [[" + adprompt + "][anythingdrabble]] (100, 200, 300, 400, or 500 words): *" + adstrippable.lower() + "*\n") except: @@ -124,6 +126,7 @@ try: zonetheprompt = zoneprompttext.find("strong") print("drabble-zone (100 or 200 words): \033[1m" + zonetheprompt.text.lower() + "\033[0m (" + zoneprompt + ")\n") thefile.write("- [[" + zoneprompt + "][drabble-zone]] (100 or 200 words): *" + zonetheprompt.text.lower() + "*\n") + emotion = "https://emotion100.dreamwidth.org/tag/*modpost?style=light&tag=%2Amodpost" emotionpage = s.get(emotion) emotionsoup = BeautifulSoup(emotionpage.content, "html.parser") @@ -146,7 +149,7 @@ try: ffa = "https://fail-fandomanon.dreamwidth.org/?style=light" ffapage = requests.get(ffa) ffasoup = BeautifulSoup(ffapage.content, "html.parser") - ffaprompts = ffasoup.find_all("h3", string=lambda text: "ffa dw post" in text.lower()) + ffaprompts = ffasoup.find_all("h3") ffapromptstrim = [x for x in ffaprompts if "Placeholder" not in str(x)] ffasubsoup = BeautifulSoup(str(ffapromptstrim[0]), "html.parser") ffaurl = ffasubsoup.find("a") @@ -210,7 +213,7 @@ try: flash = "https://fan-flashworks.dreamwidth.org/?style=light&tag=admin" flashpage = requests.get(flash) 
flashsoup = BeautifulSoup(flashpage.content, "html.parser") - flashprompts = flashsoup.find_all("h3", string=lambda text: "challenge" in text.lower()) + flashprompts = flashsoup.find_all(lambda tag: tag.name == "h3" and "Challenge" in tag.text) flashsubsoup = BeautifulSoup(str(flashprompts[0]), "html.parser") flashurl = flashsubsoup.find("a") flashprompt = (flashurl["href"]) @@ -224,24 +227,25 @@ try: except: pass -try: - femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges" - femslashpage = requests.get(femslash) - femslashsoup = BeautifulSoup(femslashpage.content, "html.parser") - femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower()) - femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser") - femslashurl = femslashsubsoup.find("a") - femslashprompt = (femslashurl["href"]) - femslashpromptnew = (femslashurl["href"] + "?style=light") - femslashpromptpage = requests.get(femslashpromptnew) - femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser") - femslashprompttext = femslashpromptsoup.find(class_="entry-content") - femslashtheprompt = femslashprompttext.find("i") - if femslashtheprompt is not None: - print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n") - thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n") -except: - pass +# seems dead +# try: +# femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges" +# femslashpage = requests.get(femslash) +# femslashsoup = BeautifulSoup(femslashpage.content, "html.parser") +# femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower()) +# femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser") +# femslashurl = femslashsubsoup.find("a") +# femslashprompt = 
(femslashurl["href"]) +# femslashpromptnew = (femslashurl["href"] + "?style=light") +# femslashpromptpage = requests.get(femslashpromptnew) +# femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser") +# femslashprompttext = femslashpromptsoup.find(class_="entry-content") +# femslashtheprompt = femslashprompttext.find("i") +# if femslashtheprompt is not None: +# print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n") +# thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n") +# except: +# pass try: with requests.Session() as s: @@ -339,98 +343,134 @@ try: except: pass -try: - if 30 > today > 21: - ssbingo = "https://sweetandshort.dreamwidth.org/tag/challenge:+bingo?style=light&tag=challenge:+bingo" - ssbingopage = requests.get(ssbingo) - ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser") - ssbingoprompts = ssbingosoup.find_all("h3") - ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser") - ssbingourl = ssbingosubsoup.find("a") - ssbingoprompt = (ssbingourl["href"]) - ssbingopromptnew = (ssbingourl["href"] + "?style=light") - ssbingopromptpage = requests.get(ssbingopromptnew) - ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser") - ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content") - ssbingotheprompt = ssbingoprompttext.find_all("td") - ssbingoclean = [] - for prompt in ssbingotheprompt: - newprompt = re.sub("<.*?>","",str(prompt)) - ssbingoclean.append(newprompt) - ssbingofinal = "; ".join(ssbingoclean).lower() - print("sweet and short bingo (up to 300 words for two prompts, up to 600 words for four prompts): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n") - thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 300 words for two prompts, up to 600 words for four prompts): *" + ssbingofinal + 
"*\n") -except: - pass +# sweet and short: complex and time-dependent rules … +# first need to work out which of the two alternating monthly challenges we're on; the month pairing below swaps between odd and even years -try: - if 16 > today > 7: - ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky" - ssquickypage = requests.get(ssquicky) - ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser") - ssquickyprompts = ssquickysoup.find_all("h3") - ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser") - ssquickyurl = ssquickysubsoup.find("a") - ssquickyprompt = (ssquickyurl["href"]) - # deliberately not using style=light here so we can get at the comment contents - ssquickypromptnew = (ssquickyurl["href"]) - ssquickypromptpage = requests.get(ssquickypromptnew) - ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser") - promptcatch = ".*New Prompts Here" - # ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch)) - ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment") - ssquickycomments = [] - for comment in ssquickytheprompt: - if re.search("New Prompts Here",str(comment)): - commenttext = re.findall(r"
",str(comment)) - commentprompt = re.sub("<.*?>","",str(commenttext)) - ssquickycomments.append(str(commentprompt)[2:-2]) - ssquickycprompt = "; ".join(ssquickycomments) - print("sweet and short comment quicky (up to 99 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n") - thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 99 words): *" + ssquickycprompt.lower() + "*\n") -except: - pass +themonth = date.today().month +thisyear = date.today().year +if thisyear % 2: + if themonth == 1 or themonth == 3 or themonth == 6 or themonth == 9 or themonth == 11: + alternate = "comment" + else: + alternate = "picture" +else: + if themonth == 1 or themonth == 3 or themonth == 6 or themonth == 9 or themonth == 11: + alternate = "picture" + else: + alternate = "comment" -try: - ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20" - ssmonthlypage = requests.get(ssmonthly) - ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser") - ssmonthlyprompts = ssmonthlysoup.find_all("h3") - ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser") - ssmonthlyurl = ssmonthlysubsoup.find("a") - ssmonthlyprompt = (ssmonthlyurl["href"]) - ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light") - ssmonthlypromptpage = requests.get(ssmonthlypromptnew) - ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser") - ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content") - ssmonthlypromptmedian = re.findall(r".*", str(ssmonthlyprompttext)) - ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian)) - ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone)) - ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo)) - ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2] - print("sweet and short monthly 
prompts (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n") - thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): *" + ssmonthlypromptfinal + "*\n") -except: - pass +if themonth != 4 and themonth != 8 and themonth != 12: + try: + if today > 21: + ssbingo = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+bingo?style=light&tag=!new+challenge,challenge:+bingo&mode=and" + ssbingopage = requests.get(ssbingo) + ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser") + ssbingoprompts = ssbingosoup.find_all("h3") + ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser") + ssbingourl = ssbingosubsoup.find("a") + ssbingoprompt = (ssbingourl["href"]) + ssbingopromptnew = (ssbingourl["href"] + "?style=light") + ssbingopromptpage = requests.get(ssbingopromptnew) + ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser") + ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content") + ssbingotheprompt = ssbingoprompttext.find_all("td") + ssbingoclean = [] + for prompt in ssbingotheprompt: + newprompt = re.sub("<.*?>","",str(prompt)) + ssbingoclean.append(newprompt) + ssbingofinal = "; ".join(ssbingoclean).lower() + print("sweet and short bingo (up to 500 words, separate or combined): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n") + thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 500 words, separate or combined): *" + ssbingofinal + "*\n") + except: + pass -try: - if today > 14: - sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun" - sspicturepage = requests.get(sspicture) - sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser") - 
monthstring = ".*" + month + ".*" - sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring)) - sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser") - sspictureurl = sspicturesubsoup.find("a") - sspictureprompt = (sspictureurl["href"]) - sspicturepromptnew = (sspictureurl["href"] + "?style=light") - sspicturepromptpage = requests.get(sspicturepromptnew) - sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser") - sspictureprompttext = sspicturepromptsoup.find("h3") - print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n") - thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n") -except: - pass + try: + if today > 7: + if alternate == "comment": + ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky" + ssquickypage = requests.get(ssquicky) + ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser") + ssquickyprompts = ssquickysoup.find_all("h3") + ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser") + ssquickyurl = ssquickysubsoup.find("a") + ssquickyprompt = (ssquickyurl["href"]) + # deliberately not using style=light here so we can get at the comment contents + ssquickypromptnew = (ssquickyurl["href"]) + ssquickypromptpage = requests.get(ssquickypromptnew) + ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser") + promptcatch = ".*New Prompts Here" + # ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch)) + ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment") + ssquickycomments = [] + for comment in ssquickytheprompt: + if re.search("New Prompts Here",str(comment)): + commenttext = re.findall(r"
",str(comment)) + commentprompt = re.sub("<.*?>","",str(commenttext)) + ssquickycomments.append(str(commentprompt)[2:-2]) + ssquickycprompt = "; ".join(ssquickycomments) + print("sweet and short comment quicky (up to 100 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n") + thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 100 words): *" + ssquickycprompt.lower() + "*\n") + elif alternate == "picture": + sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun&mode=and" + sspicturepage = requests.get(sspicture) + sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser") + monthstring = ".*" + month + ".*" + sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring)) + sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser") + sspictureurl = sspicturesubsoup.find("a") + sspictureprompt = (sspictureurl["href"]) + sspicturepromptnew = (sspictureurl["href"] + "?style=light") + sspicturepromptpage = requests.get(sspicturepromptnew) + sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser") + sspictureprompttext = sspicturepromptsoup.find("h3") + print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n") + thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n") + except: + pass + + try: + ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20&mode=and" + ssmonthlypage = requests.get(ssmonthly) + ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser") + ssmonthlyprompts = ssmonthlysoup.find_all("h3") + ssmonthlysubsoup = 
BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser") + ssmonthlyurl = ssmonthlysubsoup.find("a") + ssmonthlyprompt = (ssmonthlyurl["href"]) + ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light") + ssmonthlypromptpage = requests.get(ssmonthlypromptnew) + ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser") + ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content") + ssmonthlypromptmedian = re.findall(r".*", str(ssmonthlyprompttext)) + ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian)) + ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone)) + ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo)) + ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2] + print("sweet and short monthly prompts (up to 500 words based on at least 10 prompts): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n") + thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 500 words based on at least 10 prompts): *" + ssmonthlypromptfinal + "*\n") + except: + pass + + try: + if today > 14: + ssone = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+only+one?mode=and&style=light&tag=%21new+challenge,challenge:+only+one&mode=and" + ssonepage = requests.get(ssone) + ssonesoup = BeautifulSoup(ssonepage.content, "html.parser") + ssoneprompts = ssonesoup.find_all("h3") + ssonesubsoup = BeautifulSoup(str(ssoneprompts[0]), "html.parser") + ssoneurl = ssonesubsoup.find("a") + ssoneprompt = (ssoneurl["href"]) + ssonepromptnew = (ssoneurl["href"] + "?style=light") + ssonepromptpage = requests.get(ssonepromptnew) + ssonepromptsoup = BeautifulSoup(ssonepromptpage.content, "html.parser") + ssoneprompttext = ssonepromptsoup.find("i") + ssonepromptstripone = re.sub("<.*?>","",str(ssoneprompttext)) + ssonepromptstriptwo = re.sub("1. ","",ssonepromptstripone) + ssonepromptfinal = re.sub("2. 
","; ",ssonepromptstriptwo) + print("sweet and short one sentence (up to 500 words, use one or both lines as the start and/or end): \033[1m" + ssonepromptfinal + "\033[0m (" + ssoneprompt + ")\n") + thefile.write("- [[" + ssoneprompt + "][sweet and short one sentence]] (up to 500 words, use one or both lines as the start and/or end): *" + ssonepromptfinal + "*\n") + + except: + pass try: vocab = "https://vocab-drabbles.dreamwidth.org/?style=light&tag=challenge"