-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.py
executable file
·1932 lines (1687 loc) · 94.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# TODO
# Fix replacement skill - Know.
# CSV gen
# Fix reddit people's assertion error
# Add utils to generate unique_leaves, counts, and lookup for a sub-dict of d in interactive mode
#####################
# CONFIG
#####################
from fractions import Fraction
import os
from copy import deepcopy
from tqdm import tqdm
import traceback
import sys
import json
import regex as re
from bs4 import BeautifulSoup, NavigableString
include3_5 = True
#####################
# PROGRAM
#####################
sizes = ['Fine', 'Diminutive', 'Tiny', 'Small', 'Medium', 'Large', 'Huge', 'Gargantuan', 'Colossal']
def parseInt(s, stringIfFail=False):
    """Parse an integer out of a piece of statblock text.

    Surrounding whitespace is ignored, "," separators are removed, and a sign
    separated from its digits by a single space ("+ 5", "- 3") is rejoined
    before the conversion.

    Args:
        s: The text to convert.
        stringIfFail: When true, text that is not a valid integer is returned
            (stripped) instead of raising.

    Returns:
        The parsed int, or the stripped input text when ``stringIfFail`` is
        set and the text is not an integer.

    Raises:
        ValueError: If the text is not an integer and ``stringIfFail`` is
            false.
    """
    text = s.strip()
    try:
        return int(text.replace(",", "").replace("+ ", "+").replace("- ", "-"))
    except ValueError:
        # Only a failed conversion may fall back to the raw text; anything
        # else (KeyboardInterrupt, programming errors) must keep propagating.
        if stringIfFail:
            return text
        raise
def parsePage(html, url):
# Init the object we'll be building
pageObject = {}
# Clean up HTML
regex = r'(?:\r\n|\r|\xad|' + chr(10) + ')+' # Fix weird whitespace in some entries (e.g. Vermlek, Achaierai, Signifer of the Nail, Vampiric Mist)
html = re.sub(r'(?<=\s+)' + regex + r'|' + regex + r'(?=\s+)', r'', html) # First handle weird whitespace bordering regular whitespace - just delete it
html = re.sub(regex, r' ', html) # Then handle weird whitespace bordering non-whitespace - replace with a space
html = re.sub(r'(?<!<\s*br\s*>\s*)<\s*/\s*br\s*>', r'<br/>', html) # Fix broken <br> tags in some pages, e.g. Vilderavn. Uses a variable-width negative lookbehind, so we use the regex module instead of the re module
html = re.sub(r'<\s*/?\s*br\s*/?\s*>', r'<br/>', html) # Further fix messy <br>s, e.g. Fulgati, where <br/ > seems to cause problems
html = re.sub(r'[−—–‐‑‒―]|–|—', "-", html) # No reason to deal with all these different dashes
html = re.sub(r'’', r"'", html) # Fix dumb apostrophes like in Shaorhaz, Glutton of the Green
# Parse HTML into an object
soup = BeautifulSoup(html, "html.parser")
e = soup.select_one("div#main table tr td span")
# Handle superscripts with commas/semicolons in them - just split them into multiple superscripts
for tag in e.find_all("sup"):
if not re.search(r'[;,] ', tag.get_text()) is None:
for s in re.split(r'[;,] ', tag.get_text()):
if s.strip() == "": # Eliminate empty superscripts, mostly for Dagon
continue
newTag = soup.new_tag('sup')
newTag.string = s
tag.insert_after(newTag)
tag.extract()
# Prepare the HTML for iteration
e = e.contents
e.append(soup.new_tag("custom_end")) # Append a special end tag so we don't fall off the end
i = 0
# Helper function to skip br tags
def skipBr(optional=False):
nonlocal e, i
if not optional:
assert e[i].name == "br", url
elif e[i].name != "br":
return
i += 1
# Skip phantom spaces that sometimes show up after a br, e.g. Young Occult Dragon
if isinstance(e[i], NavigableString) and e[i].strip() == "":
i += 1
# Helper function to split on separator while avoiding splitting on commas inside parens
def splitP(s, handleAnd=False, sep=r', '):
    """Split ``s`` on the regex ``sep``, leaving bracketed text intact.

    A separator is ignored when the text after it reaches a ")" before any
    other parenthesis, or a "]" before any other square bracket, i.e. when it
    sits inside (...) or [...].

    Args:
        s: The text to split.
        handleAnd: When true, a leading "and " is removed from the last piece
            (which is also stripped in that case).
        sep: Regular expression for the separator.

    Returns:
        The list of pieces.
    """
    notEnclosed = r'(?![^()]*\)|[^\[\]]*\])'
    pieces = re.split(sep + notEnclosed, s)
    if handleAnd:
        tail = pieces[-1].strip()
        if tail.startswith("and "):
            pieces[-1] = tail[4:]
    return pieces
# Helper function to collect all following text, handling unpredictable nodes
# Doesn't stop until it hits a node on the tags list
# Will advance nodes
# Use the special "[text]" value for text nodes
def collectText(tags, skip=["sup"], mark=[]):
nonlocal e, i
text, advance = collectTextRec(e[i:], tags, skip, mark)
i += advance
return text
def collectTextRec(e, tags, skip, mark):
    """Concatenate the text of the nodes in ``e`` up to the first stop node.

    Args:
        e: Sequence of nodes (text nodes and tags) to walk.
        tags: Names that end the walk; "[text]" stands for text nodes.
        skip: Names of nodes to pass over without collecting anything.
        mark: Names of nodes whose text gets wrapped in <name>...</name>.

    Returns:
        A ``(text, count)`` pair where ``count`` is the index of the node
        that ended the walk, or ``len(e)`` when no node did.
    """
    def listed(node, names):
        # Text nodes are addressed by the "[text]" pseudo-name, tags by name
        if isinstance(node, NavigableString):
            return "[text]" in names
        return node.name in names

    pieces = []
    consumed = 0
    for node in e:
        if listed(node, tags):
            break
        consumed += 1
        if listed(node, skip):
            continue
        isText = isinstance(node, NavigableString)
        if isText:
            chunk = node
        elif node.name == "br":
            chunk = "\n"
        elif len(node.contents) > 0:
            # Only the text matters for children; their node count is dropped
            chunk, _ = collectTextRec(node.contents, tags, skip, mark)
        else:
            chunk = node.get_text()
        # Mark if requested
        if listed(node, mark):
            label = "[text]" if isText else node.name
            chunk = "<" + label + ">" + chunk + "</" + label + ">"
        pieces.append(chunk)
    return "".join(pieces), consumed
# Helper to unwrap parens
def unwrapParens(s):
    """Remove one pair of parentheses that encloses the whole string.

    The outer characters are dropped only when the opening "(" at the start
    is closed by the ")" at the very end, so text such as "(a) or (b)" is
    left intact instead of being cut to "a) or (b".

    Args:
        s: The text to unwrap.

    Returns:
        The unwrapped (or unchanged) text, stripped of surrounding whitespace.
    """
    if s.startswith("(") and s.endswith(")"):
        depth = 0
        for pos, ch in enumerate(s):
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth == 0:
                    # This is where the first "(" closes; unwrap only if
                    # that happens on the last character
                    if pos == len(s) - 1:
                        s = s[1:-1]
                    break
    return s.strip()
# Helper to strip string and trailing char
def cleanS(s, trailingChar=";"):
    """Strip ``s`` and drop one trailing ``trailingChar`` if present.

    Empty or whitespace-only input yields "" instead of raising IndexError.

    Args:
        s: The text to clean.
        trailingChar: Trailing text to remove once from the end; an empty
            value removes nothing.

    Returns:
        The cleaned text with no surrounding whitespace.
    """
    s = s.strip()
    # endswith is safe on an empty string, unlike indexing s[-1]
    if trailingChar and s.endswith(trailingChar):
        s = s[:-len(trailingChar)].strip()
    return s
# Helper to remove asterisks
asterisk_options = ["**", "*", "†"] # Longer markers such as ** go before * so regex alternation tries them first
def handleAsterisk(s):
    """Return ``s`` with every marker from asterisk_options removed, stripped."""
    alternation = r'|'.join(map(re.escape, asterisk_options))
    markerPattern = r'(?:' + alternation + r')'
    return re.sub(markerPattern, '', s).strip()
# Helper to update nested dicts
def updateNestedDict(d1, d2):
    """Merge ``d2`` into ``d1`` in place, one level deep.

    A key missing from ``d1`` receives the value object from ``d2`` as-is
    (not a copy); for a key present in both, ``d1``'s value is updated with
    ``d2``'s value via ``.update``.
    """
    for key, incoming in d2.items():
        if key in d1:
            d1[key].update(incoming)
        else:
            d1[key] = incoming
# Sweep for asterisk lines, then return back to the beginning to start the real parse
while i < len(e) - 1:
if not isinstance(e[i], NavigableString) and e[i].name == "br":
if isinstance(e[i+1], NavigableString):
result = re.search(r'^(' + r'|'.join(re.escape(x) for x in asterisk_options) + r') ', str(e[i+1]).strip())
if result is None:
i += 1
continue
asterisk = result.group(1)
elif e[i+1].name == "sup" and e[i+1].get_text().strip() in asterisk_options:
asterisk = e[i+1].get_text().strip()
else:
i += 1
continue
e[i].extract() # Remove the br
# Collect the text and remove tags along the way
s = ""
while i < len(e) - 1:
if not isinstance(e[i], NavigableString) and e[i].name in ["br", "h1", "h2", "h3"]:
break
if isinstance(e[i], NavigableString):
s += e[i]
else:
s += e[i].get_text()
e[i].extract()
s = s.strip()
if not "asterisk" in pageObject:
pageObject["asterisk"] = {}
result = re.search(r'^' + re.escape(asterisk) + r'\s*(.+)$', s)
assert not result is None, url + " |" + s + "|"
assert not result.group(1) in pageObject["asterisk"], url + " |" + s + "|"
pageObject["asterisk"][asterisk] = result.group(1)
else:
i += 1
i = 0
# Skip preamble sections, like in Mythic Nalfeshnee
while not (e[i].name == "h1" and ((e[i+1].name == "i" and e[i+2].name == "h2") or e[i+1].name == "h2")):
i += 1
# Pre-sweep to find pages with multiple statblocks
titleCount = 0
i2 = i
isTestingBlock = False
while i2 < len(e) - 1:
if not isinstance(e[i2], NavigableString):
if isTestingBlock:
if e[i2].name == "h3" and e[i2].get('class') == ['framing'] and e[i2].get_text() == "Defense":
titleCount += 1
isTestingBlock = False
elif e[i2].name in ["h1", "h2", "h3"]:
isTestingBlock = False
else:
if e[i2].name == "h2" and e[i2].get('class') == ['title']:
isTestingBlock = True
i2 += 1
if titleCount > 1:
pageObject["second_statblock"] = True
# Get main title
assert e[i].name == "h1" and e[i]['class'] == ['title'], url
pageObject["title1"] = e[i].get_text()
if e[i].find("img", src="images\\ThreeFiveSymbol.gif") is not None:
pageObject["is_3.5"] = True
i += 1
# Get short description if present
if e[i].name == "i":
pageObject["desc_short"] = e[i].get_text()
i += 1
# Get statblock title & CR
assert e[i].name == "h2" and e[i]['class'] == ['title'], url
result = re.search(r'^(.+) CR ([0-9/-]+?)(?:/MR (\d+))?$', e[i].get_text())
assert not result is None, "CR-finding Regex failed for " + url
pageObject["title2"] = result.group(1)
pageObject["CR"] = float(Fraction(result.group(2))) if "/" in result.group(2) else parseInt(result.group(2), stringIfFail=True)
if pageObject["CR"] == "-":
pageObject["CR"] = None
if not result.group(3) is None:
pageObject["MR"] = parseInt(result.group(3))
i += 1
# Get sources
assert e[i].name == "b" and e[i].get_text() == "Source", url
i += 1
assert isinstance(e[i], NavigableString), url
i += 1
pageObject["sources"] = []
while e[i].name == "a":
s = e[i].get_text()
result = re.search(r'^(.+?) pg\. (\d+)', s)
assert not result is None, url + " |" + s + "|"
pageObject["sources"].append({
"name": result.group(1).strip(),
"page": parseInt(result.group(2)),
"link": e[i]["href"].strip()
}) # Strip weird whitespace in some entries (e.g. Vermlek)
i += 1
if isinstance(e[i], NavigableString): # Skip comma text node
i += 1
assert len(pageObject["sources"]) > 0, url
skipBr()
# Get XP if present (might be blank if there is none, such as with a Butterfly/Moth)
if e[i].name == "b" and e[i].get_text() == "XP":
i += 1
assert isinstance(e[i], NavigableString), url
s = handleAsterisk(e[i].strip())
if s == "":
pageObject["XP"] = None
else:
pageObject["XP"] = parseInt(s)
i += 1
skipBr()
# Special case: Nupperibo. Has a nonstandard line to cite a 3rd-party source for the official material
if pageObject["title2"] == "Nupperibo":
s = collectText(["br"]).strip()
result = re.search(r"^(.+?) (\d+)$", s)
assert not result is None, s
pageObject["sources"].append({
"name": result.group(1).strip(),
"page": parseInt(result.group(2)),
"link": None
})
skipBr()
# Get race and class levels if present
s = collectText(["br"]).strip()
skipBr()
# Special case: The Moldering Emperor has a nonstandard line with just a race
if pageObject["title2"] == "The Moldering Emperor":
pageObject["race_class"] = {"raw": s, "race": s}
s = collectText(["br"])
skipBr()
elif isinstance(e[i], NavigableString): # If we're looking at a string instead of the bold "Init", then we have a race/class line
pageObject["race_class"] = {"raw": s}
# Special case: a few werecreatures e.g. Weremantis (Human Form) have a nonstandard " (augmented humanoid)" at the end
race_suffix = ""
if s.endswith(" (augmented humanoid)"):
race_suffix = " (augmented humanoid)"
s = s[:-len(race_suffix)]
# Handle and remove prefix(es)
prefixes = ["male", "female", "advanced", "unique", "variant", "young", "adult", "middle-aged", "old", "venerable"]
while True:
for p in sorted(prefixes, key=len, reverse=True):
if s.lower().startswith(p + " "):
if not "prefix" in pageObject["race_class"]:
pageObject["race_class"]["prefix"] = []
pageObject["race_class"]["prefix"].append(p)
s = s[len(p)+1:]
break
else:
break
# Sometimes the "variant" tag is not at the beginning, e.g. Toy Golem
if "variant" in s.lower():
if not "prefix" in pageObject["race_class"]:
pageObject["race_class"]["prefix"] = ["variant"]
elif not "variant" in pageObject["race_class"]["prefix"]:
pageObject["race_class"]["prefix"].append("variant")
# Handle ending parenthetical if present
result = re.search(r'^(.+?) \(([^)]+?)\)$', s)
if not result is None:
s = result.group(1).strip()
sources = result.group(2).split(", ")
pageObject["race_class"]["sources"] = []
for source in sources:
# Special case: comma-separated page numbers (e.g. Ghristah)
if source.isdigit():
assert len(pageObject["race_class"]["sources"]) > 0, url
pageObject["race_class"]["sources"].append({"name": pageObject["race_class"]["sources"][-1]["name"],
"page": parseInt(source)})
continue
result = re.search(fr"^(.+?) (\d+)$", source)
assert not result is None, f'Invalid race/class source block "{source}" in {url}'
pageObject["race_class"]["sources"].append({"name": result.group(1).strip(),
"page": parseInt(result.group(2))})
# Special case: "see page" (e.g. Centaur Charger) - refers to the same source as the monster statblock
if pageObject["race_class"]["sources"][-1]["name"] == "see page":
assert len(pageObject["sources"]) == 1, url
pageObject["race_class"]["sources"][-1]["name"] = pageObject["sources"][0]["name"]
# Handle classes if present
if s[-1].isdigit():
class_reg = "(?:" + "|".join(sorted((re.escape(n.lower()) for n in class_hds), key=len, reverse=True)) + ")"
class_reg_block = fr"{class_reg} (?:of [\w' -]+ )?(?:\([^)]+?\) )?\d+"
result = re.search(fr"(?:^|\s+){class_reg_block}(?:/{class_reg_block})*$", s, re.IGNORECASE)
assert not result is None, f'Class regex failed on "{s}" in {url}'
classes = s[result.start():].strip().split("/")
s = s[:result.start()].strip()
pageObject["race_class"]["class"] = []
for c in classes:
result = re.search(fr"^({class_reg}) (?:of ([\w' -]+) )?(?:\(([^)]+?)\) )?(\d+)$", c, re.IGNORECASE)
assert not result is None, f'Invalid class block "{c}" in {url}'
pageObject["race_class"]["class"].append({"name": result.group(1).strip(),
"level": parseInt(result.group(4))})
if not result.group(2) is None:
assert pageObject["race_class"]["class"][-1]["name"] in ["antipaladin", "cleric", "druid", "inquisitor", "paladin", "warpriest"], f"Deity ({result.group(2)}) found in illegal class ({pageObject['race_class']['class'][-1]['name']}) in {url}"
pageObject["race_class"]["class"][-1]["deity"] = result.group(2).strip()
if not result.group(3) is None:
pageObject["race_class"]["class"][-1]["archetype"] = result.group(3).strip()
# Handle race + misc. if anything remains
if len(s) > 0:
s = s[0].upper() + s[1:] # Capitalize first letter (since we chop off prefixes sometimes)
pageObject["race_class"]["race"] = s + race_suffix # Add back in the " (augmented humanoid)" if it was present in the special case above
# Fetch the actual alignment line this time
s = collectText(["br"])
skipBr()
# Special case: Ugash-Iram has a unique race-only line after the alignment line
elif e[i].name == "a" and pageObject["title2"] == "Ugash-Iram":
s2 = collectText(["br"])
pageObject["race_class"] = {"raw": s2}
p = "Unique"
assert s2.startswith(p), url
pageObject["race_class"]["prefix"] = [p]
s2 = s2.replace(p, "").strip()
pageObject["race_class"]["race"] = s2[0].upper() + s2[1:]
skipBr()
# Get alignment, size, type, subtypes
result = re.search(r'^(.+) (' + "|".join(sizes) + r') ([^(]+)(?: \((.+)\))?$', handleAsterisk(s))
assert not result is None, "Alignment Line Regex failed for " + url + " |" + handleAsterisk(s) + "|"
pageObject["alignment"] = {"raw": result.group(1), "cleaned": result.group(1).replace("Always ", "")}
pageObject["size"] = result.group(2)
pageObject["type"] = result.group(3)
if result.group(4) is not None:
pageObject["subtypes"] = splitP(result.group(4))
# Get initiative
assert e[i].name == "b" and e[i].get_text() == "Init", url
i += 1
assert isinstance(e[i], NavigableString), url
s = collectText("b").strip()
result = re.search(r'^([+-]\s*\d+)(?:/([+-]\s*\d+))?\s*(?:\(([+-]\s*\d+)\s+(.+?)\))?\s*(?:[,;]\s*(.+?)\s*)?;$', s)
assert not result is None, "Initiative Regex failed for " + url + " |" + s + "|"
if not result.group(2) is None: # Check for dual initiative
pageObject["initiative"] = {"bonus": [parseInt(result.group(1)), parseInt(result.group(2))]}
else:
pageObject["initiative"] = {"bonus": parseInt(result.group(1))}
if not result.group(3) is None: # Different initiative modifier in some instances (e.g. Formian Taskmaster, "(+6 with hive mind)")
pageObject["initiative"]["other"] = {result.group(4): parseInt(result.group(3))}
if not result.group(5) is None: # Initiative abilities
pageObject["initiative"]["ability"] = result.group(5)
# Get senses
assert e[i].name == "b" and e[i].get_text() == "Senses", url
i += 1
s = collectText(["h3", "br"]).strip()
if not "is_3.5" in pageObject:
result = re.search(r'^(?:(.+)[;,])?\s*(Perception\s+[+-]\s*\d+.*?)$', s) # Regex handles broken formatting on pages like Demonologist that use a comma instead of a semicolon. Space before Perception is variable length because of the typos in Elder Air Elemental and Scarlet Walker, and space inside number because of Mirror Serpent
assert not result is None, "Senses Regex failed for " + url
perceptionSkill = result.group(2) # Save perception skill to combine with skills section later
else:
result = re.search(r'^(?:(.+)[;,])?\s*(Listen\s+[+-]\s*\d+.*?),\s*(Spot\s+[+-]\s*\d+.*?)$', s)
assert not result is None, "Senses Regex failed for " + url
listenSkill = result.group(2)
spotSkill = result.group(3)
if result.group(1) is not None:
entries = splitP(result.group(1), sep=r'[,;]')
pageObject["senses"] = {}
for entry in entries:
entry = handleAsterisk(entry.strip())
result = re.search(r'^(.+?)\s+(\d+)\s*ft\s*\.?\s*(?:\((.+?)\))?$', entry)
if not result is None:
pageObject["senses"][result.group(1).lower()] = parseInt(result.group(2))
if not result.group(3) is None:
pageObject["senses"][result.group(1).lower() + "_other"] = result.group(3).strip()
else:
pageObject["senses"][entry.lower()] = True
skipBr(optional=True)
# Get auras if present
if e[i].name == "b" and e[i].get_text() == "Aura":
i += 1
pageObject["auras"] = []
for aura in splitP(collectText(["h3", "br"]).strip()):
aura_dict = {}
result = re.search(r'^(.+?)(?:\s+\((.+?)\))?$', aura)
assert not result is None, "Aura Regex failed for " + url
aura_dict['name'] = handleAsterisk(result.group(1).strip())
if not result.group(2) is None:
parts = splitP(result.group(2), sep=r'[,;]')
for part in parts:
part = part.strip()
result = re.search(r'^(\d+)[ -](?:ft\.?|feet)(?: radius)?$', part)
if not result is None:
aura_dict['radius'] = parseInt(result.group(1), stringIfFail=True)
continue
result = re.search(r'^DC (\d+)(?: (Fort|Ref|Will))?$|^(Fort|Ref|Will) DC (\d+) negates$', part)
if not result is None:
if not result.group(1) is None:
aura_dict['DC'] = parseInt(result.group(1))
if not result.group(2) is None:
aura_dict['DC_type'] = result.group(2)
else:
aura_dict['DC'] = parseInt(result.group(4))
aura_dict['DC_type'] = result.group(3)
continue
result = re.search(r'^\d+(?:d\d+)? (?:round|minute|hour|day)s?$', part)
if not result is None:
aura_dict['duration'] = part
continue
if not 'other' in aura_dict:
aura_dict['other'] = []
aura_dict['other'].append(part)
pageObject['auras'].append(aura_dict)
# DEFENSE
assert e[i].name == "h3" and e[i]['class'] == ['framing'] and e[i].get_text() == "Defense", url
i += 1
# Get AC
assert e[i].name == "b" and e[i].get_text() == "AC", url
i += 1
s = collectText(["br"]).strip()
result = re.search(r'^(-?\d+)[,;]\s+touch\s+([-+]?\d+)[,;]\s+flat-?footed\s+([-+]?\d+)(?:\s*;?\s*\((.+?)\))?(?:;?\s*(.+))?\.?$', s) # Accepts ; as well as , because of broken formatting on pages like Bugbear Lurker. Skip broken formatting trailing period in e.g. Flying Fox
assert not result is None, "AC Regex failed for " + url
pageObject["AC"] = {
"AC": parseInt(result.group(1)),
"touch": parseInt(result.group(2)),
"flat_footed": parseInt(result.group(3))
}
if not result.group(5) is None:
pageObject["AC"]["other"] = unwrapParens(result.group(5).strip())
if not result.group(4) is None:
entries = splitP(result.group(4), sep=r'[,;] ')
pageObject["AC"]["components"] = {}
for entry in entries:
entry = entry.strip() # Fixes whitespace issues in e.g. Malsandra (probably caused by \r handling)
result = re.search(r'^([+-]\d+)\s+(.+)$', entry)
if not result is None:
pageObject["AC"]["components"][result.group(2).lower().strip()] = parseInt(result.group(1))
else:
if not "other" in pageObject["AC"]["components"]:
pageObject["AC"]["components"]["other"] = []
pageObject["AC"]["components"]["other"].append(entry)
skipBr()
# Get HP, and fast healing / regeneration / other HP abilities if present
assert e[i].name == "b" and e[i].get_text() == "hp", url
i += 1
assert isinstance(e[i], NavigableString), url
s = handleAsterisk(e[i].strip())
result = re.search(r'^(\d+)(?:\s+each)?\s*\((.+?)(?: plus (.+?))?\)(?:[;,] (.+))?$', s) # Supports , instead of ; for broken formatting on pages like Egregore
assert not result is None, "HP Regex failed for " + url
pageObject["HP"] = {
"total": parseInt(result.group(1)),
"long": result.group(2)
}
if not result.group(3) is None:
pageObject["HP"]["plus"] = result.group(3).strip()
if not result.group(4) is None:
result2 = re.search(r'^(fast healing|regeneration)\s+(\d+)(?:\s*\((.+?)\))?$', result.group(4).strip())
if not result2 is None:
pageObject["HP"][result2.group(1).replace(" ", "_")] = parseInt(result2.group(2))
if not result2.group(3) is None:
pageObject["HP"][result2.group(1).replace(" ", "_") + "_weakness"] = result2.group(3).strip()
else:
pageObject["HP"]["other"] = result.group(4)
# Parse parenthetical
pageObject["HP"]["HD"] = {}
# Special case: Shifty Noble is missing the bonus HP after the trailing +
s = pageObject["HP"]["long"]
if pageObject["title2"] == "Shifty Noble":
s = s[:-1]
result = re.search(r'^(?:(\d+) HD[;,] )?((?:\d+d\d+\+)+)?(\d+d\d+)(?:\+?([+-]\d+))?(?: HD)?$', s) # Supports double plus for Jiang-Shi typo, comma instead of semicolon for Villager (Farmer), and trailing " HD" for Swamp Mummy
assert not result is None, f'HP parenthetical regex failed for "{s}" in {url}'
pageObject["HP"]["bonus_HP"] = parseInt(result.group(4)) if not result.group(4) is None else 0
HD_blocks = [result.group(3)]
if result.group(2) is not None:
HD_blocks = result.group(2).split("+")[:-1] + HD_blocks # Drop the last because of trailing +
HD_total = None if result.group(1) is None else parseInt(result.group(1))
HD_blocks_map = {}
for HD_block in HD_blocks:
result2 = re.search(r'^(\d+)d(\d+)', HD_block)
assert not result2 is None, f"Broken HD block '{HD_block}' in {url}"
die = parseInt(result2.group(2))
if not die in HD_blocks_map: # Sometimes same-size HDs from different sources are combined (Eye of Lamashtu) and sometimes not (Centaur Charger)
HD_blocks_map[die] = 0
HD_blocks_map[die] += parseInt(result2.group(1))
# Handle HD blocks
if not ("race_class" in pageObject and "class" in pageObject["race_class"]):
assert len(HD_blocks_map) == 1, f"Class HDs but no class levels in {url}"
else:
pageObject["HP"]["HD"]["class"] = []
for x in pageObject["race_class"]["class"]: # Which HD is racial is inconsistent, so we do process of elimination to figure out what it is
die = class_hds[classname_map[x["name"].lower()]]
if die == 0: # Mythic paths give no HDs, so they are encoded as hit dice 0 in our data parse
x["mythic_path"] = True
continue
if pageObject["title2"] == "Osirion Mummy": # Special case: Osirion Mummy trades the normal class HDs for d12s
die = 12
if pageObject["title2"] == "Night Scale Assassin" and x["name"] == "assassin": # Special case: Night Scale Assassin makes the assassin HD a d4 for some reason
die = 4
HD_blocks_map[die] -= x["level"]
if HD_blocks_map[die] == 0:
del HD_blocks_map[die]
pageObject["HP"]["HD"]["class"].append({"name": x["name"],
"die": die,
"num": x["level"]})
assert len(HD_blocks_map) <= 1, f"Class HDs don't match classes in {url}"
if len(HD_blocks_map) == 1: # There are racial HDs left over
pair = list(HD_blocks_map.items())[0]
pageObject["HP"]["HD"]["racial"] = {"die": pair[0], "num": pair[1]}
pageObject["HP"]["HD"]["num"] = 0
if "racial" in pageObject["HP"]["HD"]:
pageObject["HP"]["HD"]["num"] += pageObject["HP"]["HD"]["racial"]["num"]
if "class" in pageObject["HP"]["HD"]:
pageObject["HP"]["HD"]["num"] += sum(x["num"] for x in pageObject["HP"]["HD"]["class"])
assert HD_total is None or HD_total == pageObject["HP"]["HD"]["num"], f"Mismatched HD count in {url}"
i += 1
skipBr()
# Get saves
pageObject["saves"] = {}
for save in ["Fort", "Ref", "Will"]:
assert e[i].name == "b" and e[i].get_text() == save, url
i += 1
assert isinstance(e[i], NavigableString), url
s = cleanS(e[i], trailingChar=',')
i += 1
result = re.search(r'^([+-]?\s*\d+)\s*(?:\((.+?)\))?\s*(?:;\s+)?(.+?)?$', s)
assert not result is None, save + " Save Regex failed for " + url + "\tInput: |" + s + "|"
pageObject["saves"][save.lower()] = parseInt(result.group(1))
if not result.group(2) is None:
pageObject["saves"][save.lower() + "_other"] = result.group(2).strip()
# On the last save (Will) check for a post-save semicolon covering misc. bonuses that apply to every save type
if not save == "Will":
assert result.group(3) is None, url
elif not result.group(3) is None:
pageObject["saves"]["other"] = result.group(3).strip()
skipBr(optional=True)
# Get defensive abilities if present
if e[i].name == "b" and e[i].get_text() == "Defensive Abilities":
i += 1
pageObject["defensive_abilities"] = splitP(handleAsterisk(cleanS(collectText(["b", "br", "h3"]))))
# Get DR if present
if e[i].name == "b" and e[i].get_text() == "DR":
i += 1
s = cleanS(collectText(["b", "br", "h3"]))
entries = splitP(s, sep=r'(?:,|\s+and)(?:\s+DR)?\s+(?=\d+/)')
pageObject["DR"] = []
for entry in entries:
entry = entry.strip()
entrydict = {}
result = re.search(r'^(\d+)/\s*(.+?)\s*(?:\((?:(?:(.+?), )?(\d+) (?:hp|hit points|points)|(.+?))?\))?$', entry)
assert not result is None, url + " |" + entry + "|"
entrydict["amount"] = parseInt(result.group(1))
entrydict["weakness"] = result.group(2)
if not result.group(4) is None:
entrydict["max_absorb"] = parseInt(result.group(4))
if not result.group(3) is None:
entrydict["other"] = result.group(3)
elif not result.group(5) is None:
entrydict["other"] = result.group(5)
pageObject["DR"].append(entrydict)
# Get immunities if present
if e[i].name == "b" and e[i].get_text() == "Immune":
i += 1
pageObject["immunities"] = splitP(cleanS(collectText(["h3", "br", "b"])).strip(), handleAnd=True)
# Get resistances if present
if e[i].name == "b" and e[i].get_text() == "Resist":
i += 1
s = cleanS(collectText(["h3", "br", "b"])) # collectText specifically for Arcanotheign
pageObject["resistances"] = {}
# Special case: First Blade, ability in the Resist section
result = re.search(r'^(.+); (.+)$', s)
if not result is None:
pageObject["resistances"]["_ability"] = result.group(2).strip()
s = result.group(1).strip()
# Special case: Queen of Staves, says "Resist 5 fire" instead of "Resist fire 5"
if pageObject["title2"] == "Queen of Staves":
s = "fire 5"
entries = splitP(s, sep=r'(?:,?\s+and\s+|,)')
for entry in entries:
entry = entry.strip() # Handles strange whitespace in cases like Black Magga (probably caused by \r handling)
result = re.search(r'^(.+?)\s+(\d+)(?:\s*\((.+?)\))?$', entry)
if result is None: # Custom resistances, e.g. The Whispering Tyrant
pageObject["resistances"][entry] = True
else:
pageObject["resistances"][result.group(1).lower()] = parseInt(result.group(2))
if not result.group(3) is None:
pageObject["resistances"][result.group(1).lower() + "_other"] = result.group(3).strip()
# Get SR if present
if e[i].name == "b" and e[i].get_text() == "SR":
i += 1
assert isinstance(e[i], NavigableString), url
pageObject["SR"] = parseInt(cleanS(e[i]), stringIfFail=True)
i += 1
skipBr(optional=True)
# Get weaknesses if present
if e[i].name == "b" and e[i].get_text() == "Weaknesses":
i += 1
pageObject["weaknesses"] = splitP(collectText(["h3"]).strip()) # Skip leading space
# OFFENSE
assert e[i].name == "h3" and e[i]['class'] == ['framing'] and e[i].get_text() == "Offense", url
i += 1
# Get speed
assert e[i].name == "b" and e[i].get_text() == "Speed", url
i += 1
s = collectText(["br"])
# Handle entries like Solar that have one set of speeds normally and another in armor
parts = re.split(r'; (?![^()]*\))', s) # Use a special split to avoid splitting on semicolons inside parens
assert len(parts) <= 2, url
s = parts[0]
entries = splitP(s)
pageObject["speeds"] = {}
for j, entry in enumerate(entries):
result = re.search(r'^\s*(?:(.+?)\s+)?(\d+)\s*ft\s*\.\s*(?:\((.+?)\))?$', entry.strip())
if not result is None:
t = result.group(1)
if j != 0:
assert t is not None, url
elif t is None:
t = "base"
t = t.lower()
pageObject["speeds"][t] = parseInt(result.group(2))
if result.group(3) is not None:
t2 = t + "_other"
v = result.group(3).strip()
if t == "fly" and v.lower() in ["clumsy", "poor", "average", "good", "perfect"]:
v = v.lower()
t2 = "fly_maneuverability"
pageObject["speeds"][t2] = v
else:
if not "other" in pageObject["speeds"]:
pageObject["speeds"]["other"] = []
pageObject["speeds"]["other"].append(entry.strip())
if len(parts) > 1:
pageObject["speeds"]["other_semicolon"] = parts[1].strip()
skipBr()
# Get melee and ranged attacks if present
pageObject["attacks"] = {}
for attack_type in ["Melee", "Ranged"]:
if e[i].name == "b" and e[i].get_text() == attack_type:
i += 1
s = handleAsterisk(collectText(["h3", "b"]).strip())
# Special case: no melee attack (currently only present in Lar)
if attack_type == "Melee" and s == "---":
continue
# Special case: Formless Spawn, trailing comma in melee attack
if attack_type == "Melee" and pageObject["title2"] == "Formless Spawn":
s = re.sub(r",$", "", s)
key = attack_type.lower()
pageObject["attacks"][key] = []
groups = splitP(s, sep=r'(?<=\))[;,]?\s+or\s+')
# Special case: Ghristah has an "or" we need to split on that we can't otherwise easily detect (can't do this everywhere because there are "or"s we shouldn't split on in some attacks)
if pageObject["title2"] == "Ghristah":
groups = splitP(s, sep=r'[;,]?\s+or\s+')
for group in groups:
entries = splitP(group.strip(), sep=r'(?:, ?| and )')
group_list = []
for entry in entries:
entry = entry.strip()
attack_dict = {"text": entry, "entries": []}
# First, process the body and separate the parenthetical
result = re.search(r'^(\d+(?:d\d+(?:[+-]\d+)?)?)?\s*(.*?)\s*((?:[+-]\d+/)*[+-]\d+)?(?:\s+(?:(?:melee|ranged|(incorporeal))\s+)?(touch)?(?: attack)?)?\s*(?:\(([^)]+)\)\s*(?:\(([^)]+)\)| plus (.+?))?)?$', entry)
assert not result is None, "Attack Regex failed for " + url + " |" + entry + "|"
if not result.group(1) is None:
attack_dict["count"] = parseInt(result.group(1), stringIfFail=True)
attack_dict["attack"] = result.group(2)
if attack_dict["attack"] == "":
if not result.group(5) is None:
attack_dict["attack"] = "touch"
elif len(group_list) > 0: # Special case: Marauder (Pirate Captain)
attack_dict["attack"] = group_list[len(group_list) - 1]["attack"]
if not result.group(3) is None:
attack_dict["bonus"] = [parseInt(x) for x in splitP(result.group(3), sep=r'/')]
if not result.group(5) is None:
attack_dict["touch"] = True
if not result.group(4) is None:
assert attack_dict["touch"], url
attack_dict["touch"] = "incorporeal"
p = result.group(6)
if not result.group(7) is None:
attack_dict["restriction"] = result.group(7).strip()
postPlus = result.group(8)
# Helper function to parse crit block since we do it in 2 places
def parseCritBlock(s, pure=False):
    """Split a trailing critical-hit block off a damage string.

    Recognised crit blocks are "/<lo>-<hi>", "/<lo>-<hi>/x<n>" and "/x<n>",
    where either "x" or "×" introduces the single-digit multiplier.

    Args:
        s: Text to inspect.
        pure: When True, nothing may follow the crit block, and the rare
            "<text>, <lo>-<hi>" form (range after a comma, no slash) is
            tried first. When False, any text following the crit block
            after a space is captured and returned as ``post``.

    Returns:
        A 4-tuple ``(text, crit_range, crit_multiplier, post)``.
        ``text`` is the part before the crit block, stripped; if no crit
        block is found it is ``s`` exactly as passed in (not stripped) and
        the other three items are None. ``crit_range`` is the range as
        written (inner spaces are kept), ``crit_multiplier`` is the
        multiplier digit passed through ``parseInt``, and ``post`` is the
        stripped trailing text (always None when ``pure`` is True).
    """
    if pure:
        # Special case: crit block at the end after comma, happens in Deadfall Tracker and Devotee of the Ravener King
        comma_match = re.search(r'^(.+?), (\d+ *- *\d+)$', s)
        if comma_match is not None:
            return (comma_match.group(1).strip(), comma_match.group(2), None, None)

    # Group 2 is the range; the multiplier lands in group 3 when it follows a
    # range and in group 4 when it stands alone.
    pattern = r'^(.*?)/(?:(\d+ *- *\d+)(?:/\s*[×x] *(\d))?|[×x] *(\d)) *'
    if not pure:
        # Group 5: trailing text after the crit block; the lookahead refuses
        # text that starts with another slash.
        pattern += r'(?: (?!/)(.+?))?'
    pattern += r'$'

    match = re.search(pattern, s)
    if match is None:
        # No crit block: hand the input back untouched.
        return (s, None, None, None)

    crit_range = match.group(2)
    crit_multiplier = match.group(3) if match.group(3) is not None else match.group(4)
    if crit_multiplier is not None:
        crit_multiplier = parseInt(crit_multiplier)

    post = None
    if not pure and match.group(5) is not None:
        post = match.group(5).strip()
    return (match.group(1).strip(), crit_range, crit_multiplier, post)
# Now, process the parenthetical
if not p is None:
p = p.strip()
# Special case: Death Worm Leviathan has "touch" inside parens
if pageObject["title2"] == "Death Worm Leviathan":
result = re.search(r'^touch (.+)$', p)
if not result is None:
p = result.group(1).strip()
attack_dict["touch"] = True
# Special case: Clockwork Assassin's attack might deal damage or might just be smoke
if pageObject["title2"] == "Clockwork Assassin":
result = re.search(r'^(.+?) \((.+?) or (.+?)\)$', entry)
if not result is None:
p = result.group(2).strip()
groups.append(result.group(1).strip() + " (" + result.group(3).strip() + ")")
# Handle rare syntax: a single crit block at the end after the pluses
# This also catches entries with no "plus" but with a crit block
p, common_crit_range, common_crit_multiplier, _ = parseCritBlock(p, pure=True)
separators = [r',?\s+plus\s+', r';\s+']
# Special case: a few creatures have an "and" we shouldn't split on
if not pageObject["title2"] in ["Anemos", "Heresy Devil (Ayngavhaul)", "Orynox Marchelin, Fire Giant King"]:
separators.append(r',?\s+and\s+')
# Special case: in Wereboar (Hybrid Form) we need to split on a slash
if pageObject["title2"] == "Wereboar (Hybrid Form)":
separators.append(r'/(?=\D)')
# Special case: we don't split on commas in "acid, cold, electricity, or fire damage" in Zhyen, and "push, 10 ft." in Panotti
if re.search(r', or |push, \d+ ft', p) is None:
separators.append(r',\s+')
pentries = splitP(p, sep=r'(?:' + r'|'.join(separators) + r')')
if not postPlus is None: # Special case: Arcanaton, plus outside parens
pentries.append(postPlus.strip())
attack_list = []
for pentry in pentries:
pentry = pentry.strip()
entrydict = {}
attack_list.append(entrydict)
# Check if this is a damage entry
result = re.search(r'^((?:\d+d\d+|[\d.]+)(?: *[+-] *(?:\d+d\d+|[\d.]+))*(?:/(?:\d+d\d+[+-]\d+))?(?: per .+?(?=/))?)(.*?)(?: *vs\. (.+?))?$', pentry) # Special cases in this regex: periods allowed in attack damage for Frost Giant Hunter, "per" syntax for Thorny, vs. see below, slash for Forge Rider and Criminal (Street Thug)
if not result is None:
entrydict["damage"] = result.group(1)
leftover = ""
if not result.group(2) is None:
leftover = result.group(2).strip()
if not result.group(3) is None: # Rare syntax: holy/unholy weapons that deal bonus damage against a certain alignment
entrydict["applies_against"] = result.group(3).strip()
# Handle crit block
damage_type, crit_range, crit_multiplier, post = parseCritBlock(leftover, pure=False)
damage_type = damage_type.strip()
if post != None: # For e.g. Anemos, "10d6+5/19-20 electricity"
assert damage_type == "", url + " |" + damage_type + "| |" + post + "|"
damage_type = post
if damage_type != "":
entrydict["type"] = damage_type.strip()
if not common_crit_range is None or not common_crit_multiplier is None: # Can't have both a pentry crit block and a common crit block
assert crit_range is None and crit_multiplier is None, url
crit_range, crit_multiplier = common_crit_range, common_crit_multiplier
if not crit_range is None:
entrydict["crit_range"] = crit_range.replace(" ", "") # Remove erroneous spaces
if not crit_multiplier is None:
entrydict["crit_multiplier"] = crit_multiplier
continue
# Rare syntax: reversed bleed damage, e.g. "bleed 1d4" in Wolpertinger
result = re.search(r'^bleed (\d+(?:d\d+)?)', pentry)
if not result is None:
entrydict["damage"] = result.group(1)
entrydict["type"] = "bleed"
continue
# Otherwise, just treat this as an attack effect and drop the whole text in unparsed
# We could do some DC parsing here, but the format is too inconsistent for it to be worth meddling with
# Plus, sometimes the DC is for the whole attack, and sometimes just for another pentry (which is hard to identify)
entrydict["effect"] = pentry
attack_dict["entries"].append(attack_list)
group_list.append(attack_dict)
pageObject["attacks"][key].append(group_list)
# Get space if present
if e[i].name == "b" and e[i].get_text() == "Space":
i += 1
assert isinstance(e[i], NavigableString), url
result = re.search(r'^(?:(\d+)|(2\s*-?\s*1/2)|(1/2))\s*(?:ft\.?|feet)$', cleanS(e[i], ",").strip())
assert not result is None, "Space Regex failed for " + url
if not result.group(2) is None:
pageObject["space"] = 2.5
elif not result.group(3) is None:
pageObject["space"] = 0.5
else:
pageObject["space"] = parseInt(result.group(1))
i += 1
# Get reach if present
if e[i].name == "b" and e[i].get_text() == "Reach":
i += 1
assert isinstance(e[i], NavigableString), url
result = re.search(r'^(?:(\d+)|(2\s*-?\s*1/2)|(1/2))\s*(?:ft\.?|feet)(?:\s*\(?([^)]+)\)?)?$', cleanS(e[i], ",").strip())
assert not result is None, "Reach Regex failed for " + url
if not result.group(2) is None:
pageObject["reach"] = 2.5
elif not result.group(3) is None:
pageObject["reach"] = 0.5
else:
pageObject["reach"] = parseInt(result.group(1))
if not result.group(4) is None:
pageObject["reach_other"] = result.group(4).strip()
i += 1
# Skip br if present
skipBr(optional=True)
# Get special attacks if present
if e[i].name == "b" and e[i].get_text() == "Special Attacks":
i += 1
pageObject["attacks"]["special"] = [x.strip() for x in splitP(handleAsterisk(collectText(["h3", "br"]).strip()))]
skipBr(optional=True)
# Handle all spell-related blocks, including spells, spell-like abilities, and more
while True:
if e[i].name == "b" and ("Spells" in e[i].get_text() or "Extracts" in e[i].get_text()):
key = "spells"
result = re.search(r'^(?:([\w ]+) )?(?:Spells|Extracts) (Prepared|Known)$', e[i].get_text().strip())
assert not result is None, "Spell Class Regex failed for " + url
source = result.group(1)
spell_type = result.group(2).lower()
if source is None:
# If no class was listed, but we have only one class, use that one
if "classes" in pageObject and pageObject["classes"] is not None and len(pageObject["classes"]) == 1:
result = re.search(r'^(.+?)\s+\d+$', pageObject["classes"][0].strip())
if not result is None:
source = result.group(1).title()
else: # No idea. e.g. Noble (Knight), where the spells are Paladin spells, but he also has the Aristocrat class
source = "?"
elif e[i].name == "b" and e[i].get_text().strip().endswith("Spell-Like Abilities"):
key = "spell_like_abilities"
source = e[i].get_text().replace("Spell-Like Abilities", "").strip().lower() # Get type of spell-like ability (notably "Domain")
if source == "":
source = "default"
elif e[i].name == "b" and e[i].get_text().strip() == "Kineticist Wild Talents Known":
key = "kineticist_wild_talents"
elif e[i].name == "b" and (e[i].get_text().strip() == "Psychic Magic" or e[i].get_text().strip() == "Psychic Magic (Sp)"):
key = "psychic_magic"
source = "default"
else:
break
i += 1
# Handle spell-related header
if key != "kineticist_wild_talents":
# Init the first time we encounter a key of a given type (since we may encounter e.g. multiple spell blocks)
if not key in pageObject:
pageObject[key] = {"entries": []}
# Init the first time
if not "sources" in pageObject[key]:
pageObject[key]["sources"] = []
sourcedict = {"name": source}
if key == "spells":
sourcedict["type"] = spell_type
assert isinstance(e[i], NavigableString), url
result = re.search(r'^\((.+)\)$', e[i].strip())
assert not result is None, f'Spell-Related Header Base Regex failed for "{e[i].strip()}" in {url}'
entries = splitP(result.group(1).strip(), sep=r'[;,]') # Handles corrupted formatting for , instead of ; like in Ice Mage
i += 1
# The CL should always be there
result = re.search(r'^(?:CL|caster level)\s+(\d+)(?:\w{2})?$', entries.pop(0), re.IGNORECASE) # Ignore case for Nochlean
assert not result is None, "Spell-Related Header CL Regex failed for " + url
sourcedict["CL"] = parseInt(result.group(1))
# Optional entries
for entry in entries:
entry = entry.strip()
# Concentration
result = re.search(r'^conc(?:entration|\.):?\s+([+-]\d+)$', entry, re.IGNORECASE) # Concentration colon for Executioner Devil (Munagola)
if not result is None: