Update sweetandshort scraper, some more corrections to prompt scraper

2024-02-04 13:08:51 +00:00 · 2024-02-04 13:08:51 +00:00 · 4915e8db4b
commit 4915e8db4b
parent 31f4a80ddc
1 changed files with 149 additions and 109 deletions
--- a/promptscrape.py
+++ b/promptscrape.py
@ -84,6 +84,8 @@ try:
    adstrippable = str(adtheprompt.text)
    while adstrippable[-1] == " ":
        adstrippable = adstrippable[:-1]
+    while adstrippable[0] == " ":
+        adstrippable = adstrippable[1:]
    print("anythingdrabble (100, 200, 300, 400, or 500 words): \033[1m" + adstrippable.lower() + "\033[0m (" + adprompt + ")\n")
    thefile.write("- [[" + adprompt + "][anythingdrabble]] (100, 200, 300, 400, or 500 words): *" + adstrippable.lower() + "*\n")
 except:
@ -124,6 +126,7 @@ try:
        zonetheprompt = zoneprompttext.find("strong")
        print("drabble-zone (100 or 200 words): \033[1m" + zonetheprompt.text.lower() + "\033[0m (" + zoneprompt + ")\n")
        thefile.write("- [[" + zoneprompt + "][drabble-zone]] (100 or 200 words): *" + zonetheprompt.text.lower() + "*\n")
+        
        emotion = "https://emotion100.dreamwidth.org/tag/*modpost?style=light&tag=%2Amodpost"
        emotionpage = s.get(emotion)
        emotionsoup = BeautifulSoup(emotionpage.content, "html.parser")
@ -146,7 +149,7 @@ try:
    ffa = "https://fail-fandomanon.dreamwidth.org/?style=light"
    ffapage = requests.get(ffa)
    ffasoup = BeautifulSoup(ffapage.content, "html.parser")
-    ffaprompts = ffasoup.find_all("h3", string=lambda text: "ffa dw post" in text.lower())
+    ffaprompts = ffasoup.find_all("h3")
    ffapromptstrim = [x for x in ffaprompts if "Placeholder" not in str(x)]
    ffasubsoup = BeautifulSoup(str(ffapromptstrim[0]), "html.parser")
    ffaurl = ffasubsoup.find("a")
@ -210,7 +213,7 @@ try:
    flash = "https://fan-flashworks.dreamwidth.org/?style=light&tag=admin"
    flashpage = requests.get(flash)
    flashsoup = BeautifulSoup(flashpage.content, "html.parser")
-    flashprompts = flashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
+    flashprompts = flashsoup.find_all(lambda tag: tag.name == "h3" and "Challenge" in tag.text)
    flashsubsoup = BeautifulSoup(str(flashprompts[0]), "html.parser")
    flashurl = flashsubsoup.find("a")
    flashprompt = (flashurl["href"])
@ -224,24 +227,25 @@ try:
 except:
    pass

-try:
-    femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges"
-    femslashpage = requests.get(femslash)
-    femslashsoup = BeautifulSoup(femslashpage.content, "html.parser")
-    femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
-    femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser")
-    femslashurl = femslashsubsoup.find("a")
-    femslashprompt = (femslashurl["href"])
-    femslashpromptnew = (femslashurl["href"] + "?style=light")
-    femslashpromptpage = requests.get(femslashpromptnew)
-    femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser")
-    femslashprompttext = femslashpromptsoup.find(class_="entry-content")
-    femslashtheprompt = femslashprompttext.find("i")
-    if femslashtheprompt is not None:
-        print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n")
-        thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n")
-except:
-    pass
+# seems dead
+# try:
+#     femslash = "https://femslashficlets.dreamwidth.org/tag/challenges?style=light&tag=challenges"
+#     femslashpage = requests.get(femslash)
+#     femslashsoup = BeautifulSoup(femslashpage.content, "html.parser")
+#     femslashprompts = femslashsoup.find_all("h3", string=lambda text: "challenge" in text.lower())
+#     femslashsubsoup = BeautifulSoup(str(femslashprompts[0]), "html.parser")
+#     femslashurl = femslashsubsoup.find("a")
+#     femslashprompt = (femslashurl["href"])
+#     femslashpromptnew = (femslashurl["href"] + "?style=light")
+#     femslashpromptpage = requests.get(femslashpromptnew)
+#     femslashpromptsoup = BeautifulSoup(femslashpromptpage.content, "html.parser")
+#     femslashprompttext = femslashpromptsoup.find(class_="entry-content")
+#     femslashtheprompt = femslashprompttext.find("i")
+#     if femslashtheprompt is not None:
+#         print("femslash-ficlets (100–1000 words, F/F): \033[1m" + femslashtheprompt.text.lower() + "\033[0m (" + femslashprompt + ")\n")
+#         thefile.write("- [[" + femslashprompt + "][femslashficlets]] (100 words or a multiple of 100): *" + femslashtheprompt.text.lower() + "*\n")
+# except:
+#     pass

 try:
    with requests.Session() as s:
@ -339,98 +343,134 @@ try:
 except:
    pass

-try:
-    if 30 > today > 21:
-        ssbingo = "https://sweetandshort.dreamwidth.org/tag/challenge:+bingo?style=light&tag=challenge:+bingo"
-        ssbingopage = requests.get(ssbingo)
-        ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser")
-        ssbingoprompts = ssbingosoup.find_all("h3")
-        ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser")
-        ssbingourl = ssbingosubsoup.find("a")
-        ssbingoprompt = (ssbingourl["href"])
-        ssbingopromptnew = (ssbingourl["href"] + "?style=light")
-        ssbingopromptpage = requests.get(ssbingopromptnew)
-        ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser")
-        ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content")
-        ssbingotheprompt = ssbingoprompttext.find_all("td")
-        ssbingoclean = []
-        for prompt in ssbingotheprompt:
-            newprompt = re.sub("<.*?>","",str(prompt))
-            ssbingoclean.append(newprompt)
-            ssbingofinal = "; ".join(ssbingoclean).lower()
-        print("sweet and short bingo (up to 300 words for two prompts, up to 600 words for four prompts): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n")
-        thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 300 words for two prompts, up to 600 words for four prompts): *" + ssbingofinal + "*\n")
-except:
-    pass
+# sweet and short: complex and time-dependent rules …
+# first need to work out which of the two alternating monthly challenges we're on

-try:
-    if 16 > today > 7:
-        ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky"
-        ssquickypage = requests.get(ssquicky)
-        ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser")
-        ssquickyprompts = ssquickysoup.find_all("h3")
-        ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser")
-        ssquickyurl = ssquickysubsoup.find("a")
-        ssquickyprompt = (ssquickyurl["href"])
-        # deliberately not using style=light here so we can get at the comment contents
-        ssquickypromptnew = (ssquickyurl["href"])
-        ssquickypromptpage = requests.get(ssquickypromptnew)
-        ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser")
-        promptcatch = ".*New Prompts Here"
-        # ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch))
-        ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment")
-        ssquickycomments = []
-        for comment in ssquickytheprompt:
-            if re.search("New Prompts Here",str(comment)):
-                commenttext = re.findall(r"<div class=\"comment-content\".*?</div>",str(comment))
-                commentprompt = re.sub("<.*?>","",str(commenttext))
-                ssquickycomments.append(str(commentprompt)[2:-2])
-        ssquickycprompt = "; ".join(ssquickycomments)
-        print("sweet and short comment quicky (up to 99 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n")
-        thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 99 words): *" + ssquickycprompt.lower() + "*\n")
-except:
-    pass
+themonth = date.today().month  # 1-12; picks which of the two alternating challenges applies
+thisyear = date.today().year
+if thisyear % 2 == 0:  # parity test; the previous `thisyear // 2` was truthy for every year, so the else branch never ran
+    if themonth == 1 or themonth == 3 or themonth == 6 or themonth == 9 or themonth == 11:
+        alternate = "comment"
+    else:
+        alternate = "picture"
+else:  # NOTE(review): assumes odd years use the swapped schedule (even years keep the behaviour seen in 2024) - confirm
+    if themonth == 1 or themonth == 3 or themonth == 6 or themonth == 9 or themonth == 11:
+        alternate = "picture"
+    else:
+        alternate = "comment"

-try:
-    ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20"
-    ssmonthlypage = requests.get(ssmonthly)
-    ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser")
-    ssmonthlyprompts = ssmonthlysoup.find_all("h3")
-    ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser")
-    ssmonthlyurl = ssmonthlysubsoup.find("a")
-    ssmonthlyprompt = (ssmonthlyurl["href"])
-    ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light")
-    ssmonthlypromptpage = requests.get(ssmonthlypromptnew)
-    ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser")
-    ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content")
-    ssmonthlypromptmedian = re.findall(r"<a name=\"cutid1\">.*", str(ssmonthlyprompttext))
-    ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian))
-    ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone))
-    ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo))
-    ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2]
-    print("sweet and short monthly prompts (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n")
-    thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 300 words [0–9 prompts], up to 900 words [10–19 prompts], any [20 prompts]): *" + ssmonthlypromptfinal + "*\n")
-except:
-    pass
+if themonth != 4 and themonth != 8 and themonth != 12:
+    try:
+        if today > 21:
+            ssbingo = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+bingo?style=light&tag=!new+challenge,challenge:+bingo&mode=and"
+            ssbingopage = requests.get(ssbingo)
+            ssbingosoup = BeautifulSoup(ssbingopage.content, "html.parser")
+            ssbingoprompts = ssbingosoup.find_all("h3")
+            ssbingosubsoup = BeautifulSoup(str(ssbingoprompts[0]), "html.parser")
+            ssbingourl = ssbingosubsoup.find("a")
+            ssbingoprompt = (ssbingourl["href"])
+            ssbingopromptnew = (ssbingourl["href"] + "?style=light")
+            ssbingopromptpage = requests.get(ssbingopromptnew)
+            ssbingopromptsoup = BeautifulSoup(ssbingopromptpage.content, "html.parser")
+            ssbingoprompttext = ssbingopromptsoup.find(class_="entry-content")
+            ssbingotheprompt = ssbingoprompttext.find_all("td")
+            ssbingoclean = []
+            for prompt in ssbingotheprompt:
+                newprompt = re.sub("<.*?>","",str(prompt))
+                ssbingoclean.append(newprompt)
+                ssbingofinal = "; ".join(ssbingoclean).lower()
+            print("sweet and short bingo (up to 500 words, separate or combined): \033[1m" + ssbingofinal + "\033[0m (" + ssbingoprompt + ")\n")
+            thefile.write("- [[" + ssbingoprompt + "][sweet and short bingo]] (up to 500 words, separate or combined): *" + ssbingofinal + "*\n")
+    except:
+        pass

-try:
-    if today > 14:
-        sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun"
-        sspicturepage = requests.get(sspicture)
-        sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser")
-        monthstring = ".*" + month + ".*"
-        sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring))
-        sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser")
-        sspictureurl = sspicturesubsoup.find("a")
-        sspictureprompt = (sspictureurl["href"])
-        sspicturepromptnew = (sspictureurl["href"] + "?style=light")
-        sspicturepromptpage = requests.get(sspicturepromptnew)
-        sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser")
-        sspictureprompttext = sspicturepromptsoup.find("h3")
-        print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n")
-        thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n")
-except:
-    pass
+    try:
+        if today > 7:
+            if alternate == "comment":
+                ssquicky = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+comment+quicky?mode=and&style=light&tag=%21new+challenge,challenge:+comment+quicky"
+                ssquickypage = requests.get(ssquicky)
+                ssquickysoup = BeautifulSoup(ssquickypage.content, "html.parser")
+                ssquickyprompts = ssquickysoup.find_all("h3")
+                ssquickysubsoup = BeautifulSoup(str(ssquickyprompts[0]), "html.parser")
+                ssquickyurl = ssquickysubsoup.find("a")
+                ssquickyprompt = (ssquickyurl["href"])
+                # deliberately not using style=light here so we can get at the comment contents
+                ssquickypromptnew = (ssquickyurl["href"])
+                ssquickypromptpage = requests.get(ssquickypromptnew)
+                ssquickypromptsoup = BeautifulSoup(ssquickypromptpage.content, "html.parser")
+                promptcatch = ".*New Prompts Here"
+                # ssquickytheprompt = ssquickypromptsoup.find_all("h4",string = re.compile(promptcatch))
+                ssquickytheprompt = ssquickypromptsoup.find_all(class_="comment")
+                ssquickycomments = []
+                for comment in ssquickytheprompt:
+                    if re.search("New Prompts Here",str(comment)):
+                        commenttext = re.findall(r"<div class=\"comment-content\".*?</div>",str(comment))
+                        commentprompt = re.sub("<.*?>","",str(commenttext))
+                        ssquickycomments.append(str(commentprompt)[2:-2])
+                ssquickycprompt = "; ".join(ssquickycomments)
+                print("sweet and short comment quicky (up to 100 words): \033[1m" + ssquickycprompt.lower() + "\033[0m (" + ssquickyprompt + ")\n")
+                thefile.write("- [[" + ssquickyprompt + "][sweet and short comment quicky]] (up to 100 words): *" + ssquickycprompt.lower() + "*\n")
+            elif alternate == "picture":
+                sspicture = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+picture+prompt+fun?mode=and&style=light&tag=%21new+challenge,challenge:+picture+prompt+fun&mode=and"
+                sspicturepage = requests.get(sspicture)
+                sspicturesoup = BeautifulSoup(sspicturepage.content, "html.parser")
+                monthstring = ".*" + month + ".*"
+                sspictureprompts = sspicturesoup.find_all("h3", string=re.compile(monthstring))
+                sspicturesubsoup = BeautifulSoup(str(sspictureprompts[0]), "html.parser")
+                sspictureurl = sspicturesubsoup.find("a")
+                sspictureprompt = (sspictureurl["href"])
+                sspicturepromptnew = (sspictureurl["href"] + "?style=light")
+                sspicturepromptpage = requests.get(sspicturepromptnew)
+                sspicturepromptsoup = BeautifulSoup(sspicturepromptpage.content, "html.parser")
+                sspictureprompttext = sspicturepromptsoup.find("h3")
+                print("sweet and short picture prompts (up to 300 words): \033[1m" + sspictureprompttext.text.lower() + "\033[0m (" + sspictureprompt + ")\n")
+                thefile.write("- [[" + sspictureprompt + "][sweet and short picture prompts]] (up to 300 words): *" + sspictureprompttext.text.lower() + "*\n")
+    except:
+        pass
+
+    try:
+        ssmonthly = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+10+out+of+20?mode=and&style=light&tag=%21new+challenge,challenge:+10+out+of+20&mode=and"
+        ssmonthlypage = requests.get(ssmonthly)
+        ssmonthlysoup = BeautifulSoup(ssmonthlypage.content, "html.parser")
+        ssmonthlyprompts = ssmonthlysoup.find_all("h3")
+        ssmonthlysubsoup = BeautifulSoup(str(ssmonthlyprompts[0]), "html.parser")
+        ssmonthlyurl = ssmonthlysubsoup.find("a")
+        ssmonthlyprompt = (ssmonthlyurl["href"])
+        ssmonthlypromptnew = (ssmonthlyurl["href"] + "?style=light")
+        ssmonthlypromptpage = requests.get(ssmonthlypromptnew)
+        ssmonthlypromptsoup = BeautifulSoup(ssmonthlypromptpage.content, "html.parser")
+        ssmonthlyprompttext = ssmonthlypromptsoup.find(class_="entry-content")
+        ssmonthlypromptmedian = re.findall(r"<a name=\"cutid1\">.*", str(ssmonthlyprompttext))
+        ssmonthlypromptstripone = re.sub("<.*?>","",str(ssmonthlypromptmedian))
+        ssmonthlypromptstriptwo = re.sub("([a-z])- ","\\1; ",str(ssmonthlypromptstripone))
+        ssmonthlypromptstripthree = re.sub("- ","",str(ssmonthlypromptstriptwo))
+        ssmonthlypromptfinal = str(ssmonthlypromptstripthree)[2:-2]
+        print("sweet and short monthly prompts (up to 500 words based on at least 10 prompts): \033[1m" + ssmonthlypromptfinal + "\033[0m (" + ssmonthlyprompt + ")\n")
+        thefile.write("- [[" + ssmonthlyprompt + "][sweet and short monthly prompts]] (up to 500 words based on at least 10 prompts): *" + ssmonthlypromptfinal + "*\n")
+    except:
+        pass
+
+    try:
+        if today > 14:
+            ssone = "https://sweetandshort.dreamwidth.org/tag/!new+challenge,challenge:+only+one?mode=and&style=light&tag=%21new+challenge,challenge:+only+one&mode=and"
+            ssonepage = requests.get(ssone)
+            ssonesoup = BeautifulSoup(ssonepage.content, "html.parser")
+            ssoneprompts = ssonesoup.find_all("h3")
+            ssonesubsoup = BeautifulSoup(str(ssoneprompts[0]), "html.parser")
+            ssoneurl = ssonesubsoup.find("a")
+            ssoneprompt = (ssoneurl["href"])
+            ssonepromptnew = (ssoneurl["href"] + "?style=light")
+            ssonepromptpage = requests.get(ssonepromptnew)
+            ssonepromptsoup = BeautifulSoup(ssonepromptpage.content, "html.parser")
+            ssoneprompttext = ssonepromptsoup.find("i")
+            ssonepromptstripone = re.sub("<.*?>","",str(ssoneprompttext))  # strip HTML tags from the <i> element
+            ssonepromptstriptwo = re.sub(r"1\. ","",ssonepromptstripone)  # escaped dot: remove only a literal "1. " marker, not "1" + any character
+            ssonepromptfinal = re.sub(r"2\. ","; ",ssonepromptstriptwo)  # literal "2. " marker becomes the "; " separator between the two lines
+            print("sweet and short one sentence (up to 500 words, use one or both lines as the start and/or end): \033[1m" + ssonepromptfinal + "\033[0m (" + ssoneprompt + ")\n")
+            thefile.write("- [[" + ssoneprompt + "][sweet and short one sentence]] (up to 500 words, use one or both lines as the start and/or end): *" + ssonepromptfinal + "*\n")
+
+    except:
+        pass

 try:
    vocab = "https://vocab-drabbles.dreamwidth.org/?style=light&tag=challenge"