Update sweetandshort scraper, some more corrections to prompt scraper

2024-02-04 13:08:51 +00:00 · 2024-02-04 13:08:51 +00:00 · 4915e8db4b
commit 4915e8db4b
parent 31f4a80ddc
1 changed file with 149 additions and 109 deletions
--- a/promptscrape.py
+++ b/promptscrape.py
@ -84,6 +84,8 @@ try:
    adstrippable = str(adtheprompt.text)
    while adstrippable[-1] == " ":
        adstrippable = adstrippable[:-1]
    while adstrippable[0] == " ":
        adstrippable = adstrippable[1:]
    print("anythingdrabble (100, 200, 300, 400, or 500 words): \033[1m" + adstrippable.lower() + "\033[0m (" + adprompt + ")\n")
    thefile.write("- [[" + adprompt + "][anythingdrabble]] (100, 200, 300, 400, or 500 words): *" + adstrippable.lower() + "*\n")
 except:
@ -124,6 +126,7 @@ try:
        zonetheprompt = zoneprompttext.find("strong")
        print("drabble-zone (100 or 200 words): \033[1m" + zonetheprompt.text.lower() + "\033[0m (" + zoneprompt + ")\n")
        thefile.write("- [[" + zoneprompt + "][drabble-zone]] (100 or 200 words): *" + zonetheprompt.text.lower() + "*\n")
        emotion = "https://emotion100.dreamwidth.org/tag/*modpost?style=light&tag=%2Amodpost"
        emotionpage = s.get(emotion)
        emotionsoup = BeautifulSoup(emotionpage.content, "html.parser")
@ -146,7 +149,7 @@ try:
    ffa = "https://fail-fandomanon.dreamwidth.org/?style=light"
    ffapage = requests.get(ffa)
    ffasoup = BeautifulSoup(ffapage.content, "html.parser")
-    ffaprompts = ffasoup.find_all("h3", string=lambda text: "ffa dw post" in text.lower())
+    ffaprompts = ffasoup.find_all("h3")
    ffapromptstrim = [x for x in ffaprompts if "Placeholder" not in str(x)]
    ffasubsoup = BeautifulSoup(str(ffapromptstrim[0]), "html.parser")
    ffaurl = ffasubsoup.find("a")
@ -210,7 +213,7 @@ try:
    flash = "https://fan-flashworks.dreamwidth.org/?style=light&tag=admin"
    flashpage = requests.get(flash)
    flashsoup = BeautifulSoup(flashpage.content, "html.parser")
-    flashprompts = flashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
+    flashprompts = flashsoup.find_all(lambda tag: tag.name == "h3" and "Challenge" in tag.text)
    flashsubsoup = BeautifulSoup(str(flashprompts[0]), "html.parser")
    flashurl = flashsubsoup.find("a")
    flashprompt = (flashurl["href"])
@ -224,24 +227,25 @@ try:
 except:
    pass
-try:
+# seems dead
-    femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges"
+# try:
-    femslashpage = requests.get(femslash)
+#     femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges"
-    femslashsoup = BeautifulSoup(femslashpage.content, "html.parser")
+#     femslashpage = requests.get(femslash)
-    femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
+#     femslashsoup = BeautifulSoup(femslashpage.content, "html.parser")
-    femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser")
+#     femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
-    femslashurl = femslashsubsoup.find("a")
+#     femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser")
-    femslashprompt = (femslashurl["href"])
+#     femslashurl = femslashsubsoup.find("a")
-    femslashpromptnew = (femslashurl["href"] + "?style=light")
+#     femslashprompt = (femslashurl["href"])
-    femslashpromptpage = requests.get(femslashpromptnew)
+#     femslashpromptnew = (femslashurl["href"] + "?style=light")
-    femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser")
+#     femslashpromptpage = requests.get(femslashpromptnew)
-    femslashprompttext = femslashpromptsoup.find(class_="entry-content")
+#     femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser")
-    femslashtheprompt = femslashprompttext.find("i")
+#     femslashprompttext = femslashpromptsoup.find(class_="entry-content")
-    if femslashtheprompt is not None:
+#     femslashtheprompt = femslashprompttext.find("i")
-        print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n")
+#     if femslashtheprompt is not None:
-        thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n")
+#         print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n")
-except:
+#         thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n")
-    pass
+# except:
 #     pass
 try:
    with requests.Session() as s:
@ -339,98 +343,134 @@ try:
 except:
    pass
-try:
+# sweet and short: complex and time-dependent rules …
-    if 30 > today > 21:
+# first need to work out which of the two alternating monthly challenges we're on
        ssbingo = "https://sweetandshort.dreamwidth.org/tag/challenge:+bingo?style=light&tag=challenge:+bingo"
        ssbingopage = requests.get(ssbingo)
        ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser")
        ssbingoprompts = ssbingosoup.find_all("h3")
        ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser")
        ssbingourl = ssbingosubsoup.find("a")
        ssbingoprompt = (ssbingourl["href"])
        ssbingopromptnew = (ssbingourl["href"] + "?style=light")
        ssbingopromptpage = requests.get(ssbingopromptnew)
        ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser")
        ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content")
        ssbingotheprompt = ssbingoprompttext.find_all("td")
        ssbingoclean = []
        for prompt in ssbingotheprompt:
            newprompt = re.sub("<.*?>","",str(prompt))
            ssbingoclean.append(newprompt)
            ssbingofinal = "; ".join(ssbingoclean).lower()
        print("sweet and short bingo (up to 300 words for two prompts, up to 600 words for four prompts): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n")
        thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 300 words for two prompts, up to 600 words for four prompts): *" + ssbingofinal + "*\n")
 except:
    pass
-try:
+themonth = date.today().month
-    if 16 > today > 7:
+thisyear = date.today().year
-        ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky"
+if thisyear // 2:
-        ssquickypage = requests.get(ssquicky)
+    if themonth == 1 or themonth == 3 or themonth == 6 or themonth == 9 or themonth == 11:
-        ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser")
+        alternate = "comment"
-        ssquickyprompts = ssquickysoup.find_all("h3")
+    else:
-        ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser")
+        alternate = "picture"
-        ssquickyurl = ssquickysubsoup.find("a")
+else:
-        ssquickyprompt = (ssquickyurl["href"])
+    if themonth == 1 or themonth == 3 or themonth == 6 or themonth == 9 or themonth == 11:
-        # deliberately not using style=light here so we can get at the comment contents
+        alternate = "picture"
-        ssquickypromptnew = (ssquickyurl["href"])
+    else:
-        ssquickypromptpage = requests.get(ssquickypromptnew)
+        alternate = "comment"
        ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser")
        promptcatch = ".*New Prompts Here"
        # ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch))
        ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment")
        ssquickycomments = []
        for comment in ssquickytheprompt:
            if re.search("New Prompts Here",str(comment)):
                commenttext = re.findall(r"<div class=\"comment-content\".*?</div>",str(comment))
                commentprompt = re.sub("<.*?>","",str(commenttext))
                ssquickycomments.append(str(commentprompt)[2:-2])
        ssquickycprompt = "; ".join(ssquickycomments)
        print("sweet and short comment quicky (up to 99 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n")
        thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 99 words): *" + ssquickycprompt.lower() + "*\n")
 except:
    pass
-try:
+if themonth != 4 and themonth != 8 and themonth != 12:
-    ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20"
+    try:
-    ssmonthlypage = requests.get(ssmonthly)
+        if today > 21:
-    ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser")
+            ssbingo = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+bingo?style=light&tag=!new+challenge,challenge:+bingo&mode=and"
-    ssmonthlyprompts = ssmonthlysoup.find_all("h3")
+            ssbingopage = requests.get(ssbingo)
-    ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser")
+            ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser")
-    ssmonthlyurl = ssmonthlysubsoup.find("a")
+            ssbingoprompts = ssbingosoup.find_all("h3")
-    ssmonthlyprompt = (ssmonthlyurl["href"])
+            ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser")
-    ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light")
+            ssbingourl = ssbingosubsoup.find("a")
-    ssmonthlypromptpage = requests.get(ssmonthlypromptnew)
+            ssbingoprompt = (ssbingourl["href"])
-    ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser")
+            ssbingopromptnew = (ssbingourl["href"] + "?style=light")
-    ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content")
+            ssbingopromptpage = requests.get(ssbingopromptnew)
-    ssmonthlypromptmedian = re.findall(r"<a name=\"cutid1\">.*", str(ssmonthlyprompttext))
+            ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser")
-    ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian))
+            ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content")
-    ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone))
+            ssbingotheprompt = ssbingoprompttext.find_all("td")
-    ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo))
+            ssbingoclean = []
-    ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2]
+            for prompt in ssbingotheprompt:
-    print("sweet and short monthly prompts (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n")
+                newprompt = re.sub("<.*?>","",str(prompt))
-    thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): *" + ssmonthlypromptfinal + "*\n")
+                ssbingoclean.append(newprompt)
-except:
+                ssbingofinal = "; ".join(ssbingoclean).lower()
-    pass
+            print("sweet and short bingo (up to 500 words, separate or combined): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n")
            thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 500 words, separate or combined): *" + ssbingofinal + "*\n")
    except:
        pass
-try:
+    try:
-    if today > 14:
+        if today > 7:
-        sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun"
+            if alternate == "comment":
-        sspicturepage = requests.get(sspicture)
+                ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky"
-        sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser")
+                ssquickypage = requests.get(ssquicky)
-        monthstring = ".*" + month + ".*"
+                ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser")
-        sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring))
+                ssquickyprompts = ssquickysoup.find_all("h3")
-        sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser")
+                ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser")
-        sspictureurl = sspicturesubsoup.find("a")
+                ssquickyurl = ssquickysubsoup.find("a")
-        sspictureprompt = (sspictureurl["href"])
+                ssquickyprompt = (ssquickyurl["href"])
-        sspicturepromptnew = (sspictureurl["href"] + "?style=light")
+                # deliberately not using style=light here so we can get at the comment contents
-        sspicturepromptpage = requests.get(sspicturepromptnew)
+                ssquickypromptnew = (ssquickyurl["href"])
-        sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser")
+                ssquickypromptpage = requests.get(ssquickypromptnew)
-        sspictureprompttext = sspicturepromptsoup.find("h3")
+                ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser")
-        print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n")
+                promptcatch = ".*New Prompts Here"
-        thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n")
+                # ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch))
-except:
+                ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment")
-    pass
+                ssquickycomments = []
                for comment in ssquickytheprompt:
                    if re.search("New Prompts Here",str(comment)):
                        commenttext = re.findall(r"<div class=\"comment-content\".*?</div>",str(comment))
                        commentprompt = re.sub("<.*?>","",str(commenttext))
                        ssquickycomments.append(str(commentprompt)[2:-2])
                ssquickycprompt = "; ".join(ssquickycomments)
                print("sweet and short comment quicky (up to 100 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n")
                thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 100 words): *" + ssquickycprompt.lower() + "*\n")
            elif alternate == "picture":
                sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun&mode=and"
                sspicturepage = requests.get(sspicture)
                sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser")
                monthstring = ".*" + month + ".*"
                sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring))
                sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser")
                sspictureurl = sspicturesubsoup.find("a")
                sspictureprompt = (sspictureurl["href"])
                sspicturepromptnew = (sspictureurl["href"] + "?style=light")
                sspicturepromptpage = requests.get(sspicturepromptnew)
                sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser")
                sspictureprompttext = sspicturepromptsoup.find("h3")
                print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n")
                thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n")
    except:
        pass
    try:
        ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20&mode=and"
        ssmonthlypage = requests.get(ssmonthly)
        ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser")
        ssmonthlyprompts = ssmonthlysoup.find_all("h3")
        ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser")
        ssmonthlyurl = ssmonthlysubsoup.find("a")
        ssmonthlyprompt = (ssmonthlyurl["href"])
        ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light")
        ssmonthlypromptpage = requests.get(ssmonthlypromptnew)
        ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser")
        ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content")
        ssmonthlypromptmedian = re.findall(r"<a name=\"cutid1\">.*", str(ssmonthlyprompttext))
        ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian))
        ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone))
        ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo))
        ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2]
        print("sweet and short monthly prompts (up to 500 words based on at least 10 prompts): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n")
        thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 500 words based on at least 10 prompts): *" + ssmonthlypromptfinal + "*\n")
    except:
        pass
    try:
        if today > 14:
            ssone = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+only+one?mode=and&style=light&tag=%21new+challenge,challenge:+only+one&mode=and"
            ssonepage = requests.get(ssone)
            ssonesoup = BeautifulSoup(ssonepage.content, "html.parser")
            ssoneprompts = ssonesoup.find_all("h3")
            ssonesubsoup = BeautifulSoup(str(ssoneprompts[0]), "html.parser")
            ssoneurl = ssonesubsoup.find("a")
            ssoneprompt = (ssoneurl["href"])
            ssonepromptnew = (ssoneurl["href"] + "?style=light")
            ssonepromptpage = requests.get(ssonepromptnew)
            ssonepromptsoup = BeautifulSoup(ssonepromptpage.content, "html.parser")
            ssoneprompttext = ssonepromptsoup.find("i")
            ssonepromptstripone = re.sub("<.*?>","",str(ssoneprompttext))
            ssonepromptstriptwo = re.sub("1. ","",ssonepromptstripone)
            ssonepromptfinal = re.sub("2. ","; ",ssonepromptstriptwo)
            print("sweet and short one sentence (up to 500 words, use one or both lines as the start and/or end): \033[1m" + ssonepromptfinal + "\033[0m (" + ssoneprompt + ")\n")
            thefile.write("- [[" + ssoneprompt + "][sweet and short one sentence]] (up to 500 words, use one or both lines as the start and/or end): *" + ssonepromptfinal + "*\n")
    except:
        pass
 try:
    vocab = "https://vocab-drabbles.dreamwidth.org/?style=light&tag=challenge"