-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprecompute_types.py
More file actions
175 lines (153 loc) · 7.63 KB
/
precompute_types.py
File metadata and controls
175 lines (153 loc) · 7.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python3
"""
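Pre-compute usage counts of element types, classification types, property keys,
property concepts, and stewards from SearchExport.json, and write the results to
types_analysis_data.json.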
"""
import json
import sys
from collections import defaultdict
def analyze_types(data, max_parent_names=50):
    """Count element types, classifications, property keys, concepts and stewards.

    Args:
        data: Sized iterable of item dicts. Every field read from an item
            (``tinyId``, ``elementType``, ``classification``, ``properties``,
            ``property``, ``stewardOrg``) is optional; ``property`` and
            ``stewardOrg`` may also be ``None`` and are then treated as absent.
        max_parent_names: Maximum number of parent names stored per
            classification in ``classification_depth_stats``.

    Returns:
        A dict with:
          * ``total_items``: ``len(data)``.
          * ``element_types``, ``classification_types``, ``property_keys``,
            ``property_concepts``, ``stewards``: name -> occurrence count,
            ordered by descending count.
          * ``classification_depth_stats``: classification name -> dict with
            ``min_depth``, ``max_depth``, ``avg_depth``,
            ``depth_distribution`` (depth -> occurrences), ``parent_count``,
            ``parent_names`` (sorted, truncated to ``max_parent_names``) and
            ``has_more_parents``.
          * ``element_type_to_items``, ``classification_to_items``,
            ``property_key_to_items``, ``property_concept_to_items``,
            ``steward_to_items``: name -> number of distinct items using it.
    """
    # Occurrence counters.
    element_types = defaultdict(int)
    classification_types = defaultdict(int)
    classification_depths = defaultdict(list)  # every depth a name was seen at
    classification_parents = defaultdict(set)  # distinct parent names per name
    property_keys = defaultdict(int)
    property_concepts = defaultdict(int)
    stewards = defaultdict(int)

    # Which items use which names (collapsed to counts at the end).
    element_type_to_items = defaultdict(set)
    classification_to_items = defaultdict(set)
    property_key_to_items = defaultdict(set)
    property_concept_to_items = defaultdict(set)
    steward_to_items = defaultdict(set)

    def record_elements(elements, item_id, depth=0, parent_name=None):
        """Record each named element and recurse into its nested ``elements``."""
        for elem in elements:
            name = elem.get('name')
            if name:
                classification_types[name] += 1
                classification_depths[name].append(depth)
                classification_to_items[name].add(item_id)
                # Top-level elements have no parent to record.
                if depth > 0 and parent_name:
                    classification_parents[name].add(parent_name)
            children = elem.get('elements')
            if children:
                # An unnamed element passes its own parent down to its children.
                record_elements(
                    children, item_id, depth + 1, name if name else parent_name
                )

    for i, item in enumerate(data):
        # Fall back to a positional id when tinyId is missing, null or empty,
        # so distinct items never collapse into one entry of the item sets.
        tiny_id = item.get('tinyId') or f'item_{i}'

        # 1. Element types
        element_type = item.get('elementType', 'Unknown')
        if element_type:
            element_types[element_type] += 1
            element_type_to_items[element_type].add(tiny_id)

        # 2. Classification names (classification[].elements[].name, nested)
        for cls in item.get('classification') or []:
            if cls.get('elements'):
                record_elements(cls['elements'], tiny_id)

        # 3. Property keys (properties[].key)
        for prop in item.get('properties') or []:
            prop_key = prop.get('key', 'Unknown')
            if prop_key:
                property_keys[prop_key] += 1
                property_key_to_items[prop_key].add(tiny_id)

        # 4. Property concepts (property.concepts[].name).
        # `or {}` guards against an explicit null, which .get(key, {}) lets through.
        for concept in (item.get('property') or {}).get('concepts') or []:
            concept_name = concept.get('name', 'Unknown')
            if concept_name:
                property_concepts[concept_name] += 1
                property_concept_to_items[concept_name].add(tiny_id)

        # 5. Stewards (stewardOrg.name); a null stewardOrg counts as 'Unknown'.
        steward_name = (item.get('stewardOrg') or {}).get('name', 'Unknown')
        if steward_name:
            stewards[steward_name] += 1
            steward_to_items[steward_name].add(tiny_id)

    # Depth statistics per classification. Each depth list is non-empty because
    # an entry is only created by an append.
    classification_depth_stats = {}
    for cls_name, depths in classification_depths.items():
        distribution = defaultdict(int)
        for depth in depths:
            distribution[depth] += 1
        parent_set = classification_parents.get(cls_name, set())
        parent_count = len(parent_set)
        classification_depth_stats[cls_name] = {
            'min_depth': min(depths),
            'max_depth': max(depths),
            'avg_depth': sum(depths) / len(depths),
            'depth_distribution': dict(sorted(distribution.items())),
            'parent_count': parent_count,
            'parent_names': sorted(parent_set)[:max_parent_names],
            'has_more_parents': parent_count > max_parent_names,
        }

    def by_count_desc(counter):
        """Return a plain dict ordered by descending count (ties keep first-seen order)."""
        return dict(sorted(counter.items(), key=lambda kv: kv[1], reverse=True))

    def item_counts(mapping):
        """Collapse name -> set-of-item-ids into name -> number of distinct items."""
        return {name: len(ids) for name, ids in mapping.items()}

    return {
        'total_items': len(data),
        'element_types': by_count_desc(element_types),
        'classification_types': by_count_desc(classification_types),
        'classification_depth_stats': classification_depth_stats,
        'property_keys': by_count_desc(property_keys),
        'property_concepts': by_count_desc(property_concepts),
        'stewards': by_count_desc(stewards),
        'element_type_to_items': item_counts(element_type_to_items),
        'classification_to_items': item_counts(classification_to_items),
        'property_key_to_items': item_counts(property_key_to_items),
        'property_concept_to_items': item_counts(property_concept_to_items),
        'steward_to_items': item_counts(steward_to_items),
    }
def main():
    """Load SearchExport.json, analyze it, save the result and print a summary.

    Reads ``SearchExport.json`` from the current directory, passes the parsed
    data to ``analyze_types``, writes the result to
    ``types_analysis_data.json`` and prints the number of unique names per
    category followed by the first five entries of three of them.

    Exits with status 1 when the input file is missing or is not valid JSON.
    """
    print("Loading SearchExport.json...")
    try:
        with open('SearchExport.json', 'r', encoding='utf-8') as source:
            data = json.load(source)
    except FileNotFoundError:
        print("Error: SearchExport.json not found")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        sys.exit(1)

    print(f"Analyzing {len(data)} items...")
    result = analyze_types(data)

    output_file = 'types_analysis_data.json'
    print(f"Writing results to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(result, sink, indent=2, ensure_ascii=False)
    print("Analysis complete!")

    # (label used in the printed text, key in the result dict)
    categories = (
        ('element types', 'element_types'),
        ('classification types', 'classification_types'),
        ('property keys', 'property_keys'),
        ('property concepts', 'property_concepts'),
        ('stewards', 'stewards'),
    )

    print("\nSummary:")
    print(f" Total items: {result['total_items']}")
    for label, key in categories:
        print(f" Unique {label}: {len(result[key])}")

    # Only the first three categories get a "top 5" listing; the result dicts
    # are already ordered by descending count.
    for label, key in categories[:3]:
        print(f"\nTop {label}:")
        for name, count in list(result[key].items())[:5]:
            print(f" - {name}: {count}")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()