diff --git a/parser/parser.go b/parser/parser.go index 1dd2241..b79e3b5 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -78,6 +78,9 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { parse(path) } + // Ensure professor section references remain consistent after parsing section data. + syncProfessorSectionLinks() + log.Printf("\nParsing complete. Created %d courses, %d sections, and %d professors.", len(Courses), len(Sections), len(Professors)) log.Print("\nParsing course requisites...") diff --git a/parser/professorParser.go b/parser/professorParser.go index 40eb3e5..2fbc99e 100644 --- a/parser/professorParser.go +++ b/parser/professorParser.go @@ -1,6 +1,8 @@ package parser import ( + "fmt" + "regexp" "strings" "github.com/PuerkitoBio/goquery" @@ -10,6 +12,73 @@ import ( "go.mongodb.org/mongo-driver/bson/primitive" ) +var ( + apiPrimaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-Za-z]?)$`) + apiFallbackLocationRegex = regexp.MustCompile(`^([A-Za-z]+)(\d+)\.?([\d]{3}[A-Za-z]?)$`) +) + +func newProfessor(firstName, lastName string) *schema.Professor { + prof := &schema.Professor{} + prof.Id = primitive.NewObjectID() + prof.First_name = firstName + prof.Last_name = lastName + return prof +} + +func buildProfessorFromRow(row profileIndexRow) *schema.Professor { + firstName := strings.TrimSpace(row.FirstName) + lastName := strings.TrimSpace(row.LastName) + if firstName == "" || lastName == "" { + firstName, lastName = splitFullName(row.FullName) + } + + // Ignore blank names to match the parser's existing professor population behavior. 
+ if firstName == "" || lastName == "" { + return nil + } + + prof := newProfessor(firstName, lastName) + applyProfileFields(prof, row) + + return prof +} + +func applyProfileFields(prof *schema.Professor, row profileIndexRow) { + titles := collectTitles(row) + info := bestInformationData(row.Information) + if prof.Titles == nil { + prof.Titles = []string{} + } + + for _, title := range titles { + if !containsString(prof.Titles, title) { + prof.Titles = append(prof.Titles, title) + } + } + + if prof.Email == "" { + prof.Email = trimNullableString(info.Email) + } + if prof.Phone_number == "" { + prof.Phone_number = trimNullableString(info.Phone) + } + if prof.Office.Building == "" && prof.Office.Room == "" && prof.Office.Map_uri == "" { + prof.Office = bestLocation(row.Information) + } + if prof.Profile_uri == "" { + prof.Profile_uri = bestProfileURI(row) + } + if prof.Image_uri == "" { + prof.Image_uri = bestImageURI(row) + } + if prof.Office_hours == nil { + prof.Office_hours = []schema.Meeting{} + } + if prof.Sections == nil { + prof.Sections = []primitive.ObjectID{} + } +} + func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]*goquery.Selection) []primitive.ObjectID { professorText := utils.TrimWhitespace(rowInfo["Instructor(s):"].Text()) professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1) @@ -28,20 +97,34 @@ func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]*goquery.S } profKey := firstName + lastName + email := utils.TrimWhitespace(match[3]) prof, profExists := Professors[profKey] if profExists { prof.Sections = append(prof.Sections, sectionId) + if prof.Email == "" { + prof.Email = email + } profRefs = append(profRefs, prof.Id) continue } - prof = &schema.Professor{} - prof.Id = primitive.NewObjectID() - prof.First_name = firstName - prof.Last_name = lastName + if profByEmail, emailMatch := findProfessorByEmail(email); emailMatch { + profByEmail.Sections = append(profByEmail.Sections, sectionId) + if 
// splitFullName breaks a display name into a first-name part and a last-name
// part. The final whitespace-separated token becomes the last name and
// everything before it the first name; a single token maps to (token, "")
// and a blank input maps to ("", "").
func splitFullName(fullName string) (string, string) {
	tokens := strings.Fields(strings.TrimSpace(fullName))
	switch len(tokens) {
	case 0:
		return "", ""
	case 1:
		return tokens[0], ""
	}

	lastIdx := len(tokens) - 1
	return strings.Join(tokens[:lastIdx], " "), tokens[lastIdx]
}
fmt.Sprintf("https://locator.utdallas.edu/%s_%s", building, room), + } +} + +func collectTitles(row profileIndexRow) []string { + titles := make([]string, 0, 8) + if row.Name != "" { + titles = append(titles, strings.TrimSpace(row.Name)) + } + + for _, info := range row.Information { + for _, candidate := range []*string{info.Data.Title, info.Data.SecondaryTitle, info.Data.TertiaryTitle, info.Data.DistinguishedTitle} { + trimmed := trimNullableString(candidate) + if trimmed == "" { + continue + } + if !containsString(titles, trimmed) { + titles = append(titles, trimmed) + } + } + } + + return titles +} + +func bestInformationData(items []profileInformation) profileInformationData { + if len(items) == 0 { + return profileInformationData{} + } + + best := items[0].Data + bestScore := informationScore(best) + + for _, item := range items[1:] { + score := informationScore(item.Data) + if score > bestScore { + best = item.Data + bestScore = score + } + } + + return best +} + +func informationScore(data profileInformationData) int { + score := 0 + for _, value := range []string{ + trimNullableString(data.Email), + trimNullableString(data.Phone), + trimNullableString(data.Location), + trimNullableString(data.URL), + trimNullableString(data.SecondaryURL), + trimNullableString(data.TertiaryURL), + trimNullableString(data.QuaternaryURL), + trimNullableString(data.QuinaryURL), + trimNullableString(data.Title), + trimNullableString(data.SecondaryTitle), + trimNullableString(data.TertiaryTitle), + trimNullableString(data.DistinguishedTitle), + trimNullableString(data.ProfileSummary), + trimNullableString(data.AcceptingStudents), + trimNullableString(data.NotAcceptingStudents), + } { + if strings.TrimSpace(value) != "" { + score++ + } + } + + return score +} + +func bestLocation(items []profileInformation) schema.Location { + for _, item := range items { + location := parseAPILocation(trimNullableString(item.Data.Location)) + if location.Building != "" || location.Room != "" { + 
return location + } + } + + return schema.Location{} +} + +func bestProfileURI(row profileIndexRow) string { + if trimmed := strings.TrimSpace(row.URL); trimmed != "" { + return trimmed + } + + for _, info := range row.Information { + for _, candidate := range []*string{info.Data.URL, info.Data.SecondaryURL, info.Data.TertiaryURL, info.Data.QuaternaryURL, info.Data.QuinaryURL} { + trimmed := trimNullableString(candidate) + if trimmed != "" { + return trimmed + } + } + } + + for _, candidate := range []string{row.APIURL} { + trimmed := strings.TrimSpace(candidate) + if trimmed != "" { + return trimmed + } + } + + return "" +} + +func bestImageURI(row profileIndexRow) string { + if trimmed := strings.TrimSpace(row.ImageURL); trimmed != "" { + return trimmed + } + + for _, media := range row.Media { + for _, key := range []string{"url", "image_url", "src", "uri"} { + if raw, exists := media[key]; exists { + if str, ok := raw.(string); ok { + trimmed := strings.TrimSpace(str) + if trimmed != "" { + return trimmed + } + } + } + } + } + + return "" +} + +func mergeProfileProfessor(target, source *schema.Professor) { + if target == nil || source == nil { + return + } + if target.Titles == nil { + target.Titles = []string{} + } + + for _, title := range source.Titles { + if !containsString(target.Titles, title) { + target.Titles = append(target.Titles, title) + } + } + + if target.Email == "" { + target.Email = source.Email + } + if target.Phone_number == "" { + target.Phone_number = source.Phone_number + } + if target.Office.Building == "" && target.Office.Room == "" && target.Office.Map_uri == "" { + target.Office = source.Office + } + if target.Profile_uri == "" { + target.Profile_uri = source.Profile_uri + } + if target.Image_uri == "" { + target.Image_uri = source.Image_uri + } + if target.Office_hours == nil { + target.Office_hours = source.Office_hours + } + if target.Sections == nil { + target.Sections = source.Sections + } +} + +func containsString(values 
// trimNullableString safely dereferences an optional string: nil yields ""
// and any other value is returned with surrounding whitespace removed.
func trimNullableString(value *string) string {
	result := ""
	if value != nil {
		result = strings.TrimSpace(*value)
	}
	return result
}
Skipping profile load.") @@ -49,3 +101,15 @@ func loadProfiles(inDir string) { log.Printf("Loaded %d profiles!", profileCount) fptr.Close() } + +// ParseProfiles loads profile data and writes only the professors output file. +func ParseProfiles(inDir string, outDir string) { + loadProfiles(inDir) + syncProfessorSectionLinks() + + if err := os.MkdirAll(outDir, 0777); err != nil { + panic(err) + } + + utils.WriteJSON(fmt.Sprintf("%s/professors.json", outDir), utils.GetMapValues(Professors)) +} diff --git a/parser/profiles.go b/parser/profiles.go new file mode 100644 index 0000000..39754e6 --- /dev/null +++ b/parser/profiles.go @@ -0,0 +1,148 @@ +package parser + +import ( + "encoding/json" + "fmt" + + "go.mongodb.org/mongo-driver/bson/primitive" +) + +const profilesRawFileName = "profiles.json" + +type profileIndexResponse struct { + Count int `json:"count"` + Profile []profileIndexRow `json:"profile"` +} + +type profileIndexRow struct { + ID int `json:"id"` + FullName string `json:"full_name"` + FirstName string `json:"first_name"` + LastName string `json:"last_name"` + Slug string `json:"slug"` + Public bool `json:"public"` + URL string `json:"url"` + Name string `json:"name"` + ImageURL string `json:"image_url"` + APIURL string `json:"api_url"` + Media []map[string]any `json:"media"` + Information []profileInformation `json:"information"` + Areas []profileArea `json:"areas"` +} + +type profileInformation struct { + Data profileInformationData `json:"data"` +} + +type profileInformationData struct { + URL *string `json:"url"` + SecondaryURL *string `json:"secondary_url"` + TertiaryURL *string `json:"tertiary_url"` + QuaternaryURL *string `json:"quaternary_url"` + QuinaryURL *string `json:"quinary_url"` + Email *string `json:"email"` + Phone *string `json:"phone"` + Title *string `json:"title"` + SecondaryTitle *string `json:"secondary_title"` + TertiaryTitle *string `json:"tertiary_title"` + DistinguishedTitle *string `json:"distinguished_title"` + Location *string 
`json:"location"` + ProfileSummary *string `json:"profile_summary"` + AcceptingStudents *string `json:"accepting_students"` + NotAcceptingStudents *string `json:"not_accepting_students"` +} + +type profileArea struct { + Data profileAreaData `json:"data"` +} + +type profileAreaData struct { + Title string `json:"title"` + Description string `json:"description"` +} + +func decodeProfileRows(payload []byte) ([]profileIndexRow, error) { + var rows []profileIndexRow + if err := json.Unmarshal(payload, &rows); err == nil { + return rows, nil + } + + var response profileIndexResponse + if err := json.Unmarshal(payload, &response); err == nil { + return response.Profile, nil + } + + return nil, fmt.Errorf("unsupported profiles JSON shape") +} + +// syncProfessorSectionLinks ensures professor->section links match parsed section references. +// Course connections are derived transitively through each section's Course_reference. +func syncProfessorSectionLinks() { + if len(Professors) == 0 || len(Sections) == 0 { + return + } + + for _, prof := range Professors { + if prof == nil { + continue + } + if prof.Sections == nil { + prof.Sections = []primitive.ObjectID{} + } + } + + for sectionID, section := range Sections { + if section == nil { + continue + } + + for _, profID := range section.Professors { + profKey, ok := ProfessorIDMap[profID] + if !ok { + continue + } + prof, exists := Professors[profKey] + if !exists || prof == nil { + continue + } + if !containsObjectID(prof.Sections, sectionID) { + prof.Sections = append(prof.Sections, sectionID) + } + } + } + + for _, prof := range Professors { + if prof == nil { + continue + } + prof.Sections = dedupeObjectIDs(prof.Sections) + } +} + +func containsObjectID(values []primitive.ObjectID, target primitive.ObjectID) bool { + for _, value := range values { + if value == target { + return true + } + } + + return false +} + +func dedupeObjectIDs(values []primitive.ObjectID) []primitive.ObjectID { + if len(values) < 2 { + return 
values + } + + seen := make(map[primitive.ObjectID]struct{}, len(values)) + result := make([]primitive.ObjectID, 0, len(values)) + for _, value := range values { + if _, exists := seen[value]; exists { + continue + } + seen[value] = struct{}{} + result = append(result, value) + } + + return result +} diff --git a/parser/profiles_test.go b/parser/profiles_test.go new file mode 100644 index 0000000..0bef1a1 --- /dev/null +++ b/parser/profiles_test.go @@ -0,0 +1,85 @@ +package parser + +import ( + "testing" +) + +func TestBestInformationDataChoosesMostCompleteEntry(t *testing.T) { + t.Parallel() + + items := []profileInformation{ + {Data: profileInformationData{Email: profileStrPtr(""), Phone: profileStrPtr(""), Location: profileStrPtr("")}}, + {Data: profileInformationData{Email: profileStrPtr("alice@utdallas.edu"), Phone: profileStrPtr("972-000-0000"), URL: profileStrPtr("https://example.com"), Title: profileStrPtr("Professor")}}, + {Data: profileInformationData{Email: profileStrPtr("bob@utdallas.edu")}}, + } + + best := bestInformationData(items) + if got := trimNullableString(best.Email); got != "alice@utdallas.edu" { + t.Fatalf("expected most complete information entry, got %q", got) + } +} + +func TestBestProfileURIUsesFallbacks(t *testing.T) { + t.Parallel() + + row := profileIndexRow{ + APIURL: "https://profiles.utdallas.edu/api/v1?person=alice", + Information: []profileInformation{ + {Data: profileInformationData{SecondaryURL: profileStrPtr("https://profiles.utdallas.edu/alice")}}, + }, + } + + uri := bestProfileURI(row) + if uri != "https://profiles.utdallas.edu/alice" { + t.Fatalf("expected secondary URL fallback, got %q", uri) + } +} + +func TestBestImageURIUsesMediaFallback(t *testing.T) { + t.Parallel() + + row := profileIndexRow{ + Media: []map[string]any{ + {"id": 11}, + {"image_url": "https://profiles.utdallas.edu/img/alice.jpg"}, + }, + } + + uri := bestImageURI(row) + if uri != "https://profiles.utdallas.edu/img/alice.jpg" { + t.Fatalf("expected 
// profileStrPtr is a test helper that returns a pointer to the given string.
func profileStrPtr(value string) *string {
	copied := value
	return &copied
}
professor profiles. -const BASE_URL string = "https://profiles.utdallas.edu/browse?page=" +const ( + profileBatchSize = 25 + profileRequestTimeout = 30 * time.Second + profileRequestDelay = 200 * time.Millisecond +) -var primaryLocationRegex *regexp.Regexp = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-z]?)$`) -var fallbackLocationRegex *regexp.Regexp = regexp.MustCompile(`^([A-z]+)(\d+)\.?(\d{3}[A-z]?)$`) +type profileIndexResponse struct { + Count int `json:"count"` + Profile []profileIndexRecord `json:"profile"` +} -func parseLocation(text string) schema.Location { - var building string - var room string +type profileIndexRecord struct { + ID int `json:"id"` + FullName string `json:"full_name"` + FirstName string `json:"first_name"` + LastName string `json:"last_name"` + Slug string `json:"slug"` + Public bool `json:"public"` + URL string `json:"url"` + Name string `json:"name"` + ImageURL string `json:"image_url"` + APIURL string `json:"api_url"` + Media []profileMedia `json:"media"` +} - submatches := primaryLocationRegex.FindStringSubmatch(text) - if submatches == nil { - submatches = fallbackLocationRegex.FindStringSubmatch(text) - if submatches == nil { - return schema.Location{} - } else { - building = submatches[1] - room = fmt.Sprintf("%s.%s", submatches[2], submatches[3]) - } - } else { - building = submatches[1] - room = submatches[2] +type profileRawRecord struct { + ID int `json:"id"` + FullName string `json:"full_name"` + FirstName string `json:"first_name"` + LastName string `json:"last_name"` + Slug string `json:"slug"` + Public bool `json:"public"` + URL string `json:"url"` + Name string `json:"name"` + ImageURL string `json:"image_url"` + APIURL string `json:"api_url"` + Media []profileMedia `json:"media"` + Information []profileInfoBlock `json:"information"` + Areas []profileAreaBlock `json:"areas"` +} + +type profileMedia struct { + ID int `json:"id"` + ModelID int `json:"model_id"` + UUID string `json:"uuid"` + ModelType string 
`json:"model_type"` + CollectionName string `json:"collection_name"` + Name string `json:"name"` + FileName string `json:"file_name"` + MimeType string `json:"mime_type"` + Disk string `json:"disk"` + ConversionsDisk string `json:"conversions_disk"` + Size int `json:"size"` + Manipulations []any `json:"manipulations"` + CustomProperties []any `json:"custom_properties"` + GeneratedConversions profileGeneratedConversions `json:"generated_conversions"` + ResponsiveImages any `json:"responsive_images"` + OrderColumn int `json:"order_column"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` + OriginalURL string `json:"original_url"` + PreviewURL string `json:"preview_url"` +} + +type profileGeneratedConversions struct { + Large bool `json:"large"` + Thumb bool `json:"thumb"` + Medium bool `json:"medium"` +} + +type profileInformationData struct { + URL *string `json:"url"` + Email *string `json:"email"` + Phone *string `json:"phone"` + Title *string `json:"title"` + ORCID *string `json:"orc_id"` + Location *string `json:"location"` + URLName *string `json:"url_name"` + QuinaryURL *string `json:"quinary_url"` + FancyHeader *string `json:"fancy_header"` + TertiaryURL *string `json:"tertiary_url"` + SecondaryURL *string `json:"secondary_url"` + ORCIDManaged *string `json:"orc_id_managed"` + QuaternaryURL *string `json:"quaternary_url"` + TertiaryTitle *string `json:"tertiary_title"` + ProfileSummary *string `json:"profile_summary"` + SecondaryTitle *string `json:"secondary_title"` + QuinaryURLName *string `json:"quinary_url_name"` + TertiaryURLName *string `json:"tertiary_url_name"` + AcceptingStudents *string `json:"accepting_students"` + FancyHeaderRight *string `json:"fancy_header_right"` + SecondaryURLName *string `json:"secondary_url_name"` + DistinguishedTitle *string `json:"distinguished_title"` + QuaternaryURLName *string `json:"quaternary_url_name"` + NotAcceptingStudents *string `json:"not_accepting_students"` + AcceptingGradStudents 
*string `json:"accepting_grad_students"` + ShowAcceptingStudents *string `json:"show_accepting_students"` + NotAcceptingGradStudents *string `json:"not_accepting_grad_students"` + ShowNotAcceptingStudents *string `json:"show_not_accepting_students"` +} + +type profileInfoBlock struct { + ID int `json:"id"` + ProfileID int `json:"profile_id"` + Type string `json:"type"` + SortOrder int `json:"sort_order"` + Data profileInformationData `json:"data"` + Public bool `json:"public"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` +} + +type profileAreaData struct { + Title string `json:"title"` + Description string `json:"description"` +} + +type profileAreaBlock struct { + ID int `json:"id"` + ProfileID int `json:"profile_id"` + Type string `json:"type"` + SortOrder int `json:"sort_order"` + Data profileAreaData `json:"data"` + Public bool `json:"public"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` +} + +type profileDetailsResponse struct { + Count int `json:"count"` + Profile []profileRawRecord `json:"profile"` +} + +// ScrapeProfiles fetches the raw profile API response and writes it to disk. 
+func ScrapeProfiles(outDir string) { + log.Print("Beginning profile scrape.") + + client := &http.Client{Timeout: profileRequestTimeout} + + indexResponse, err := fetchProfileIndex(client) + if err != nil { + log.Printf("Failed to retrieve profile index: %v", err) + return } - return schema.Location{ - Building: building, - Room: room, - Map_uri: fmt.Sprintf("https://locator.utdallas.edu/%s_%s", building, room), + if len(indexResponse.Profile) == 0 { + log.Print("Profile API returned no profiles.") + return } -} -func parseList(list []string) (string, schema.Location) { - var phoneNumber string - var office schema.Location - - for _, element := range list { - element = strings.TrimSpace(element) - utils.VPrintf("Element is: %s", element) - if strings.Contains(element, "-") { - phoneNumber = element - } else if primaryLocationRegex.MatchString(element) || fallbackLocationRegex.MatchString(element) { - utils.VPrintf("Element match is: %s", element) - office = parseLocation(element) - break + slugs := make([]string, 0, len(indexResponse.Profile)) + for _, row := range indexResponse.Profile { + slug := strings.TrimSpace(row.Slug) + if slug == "" { + continue } + slugs = append(slugs, slug) } + slugs = dedupeStrings(slugs) - return phoneNumber, office -} + if len(slugs) == 0 { + log.Print("Profile API index contained no valid slugs.") + return + } -func parseName(fullName string) (string, string) { - commaIndex := strings.Index(fullName, ",") - if commaIndex != -1 { - fullName = fullName[:commaIndex] + log.Printf("Retrieved %d profile slugs.", len(slugs)) + + detailedProfiles := make([]profileRawRecord, 0, len(slugs)) + log.Printf("Pulling profile details by person slug in batches of %d.", profileBatchSize) + for i := 0; i < len(slugs); i += profileBatchSize { + end := i + profileBatchSize + if end > len(slugs) { + end = len(slugs) + } + + batch := slugs[i:end] + batchProfiles, fetchErr := fetchProfileDetails(client, batch) + if fetchErr != nil { + log.Printf("Failed 
to retrieve profile detail batch %d-%d: %v", i+1, end, fetchErr) + continue + } + + detailedProfiles = append(detailedProfiles, batchProfiles...) + log.Printf("Fetched profile detail batch %d-%d (%d records).", i+1, end, len(batchProfiles)) + + if end < len(slugs) { + time.Sleep(profileRequestDelay) + } } - names := strings.Split(fullName, " ") - ultimateName := strings.ToLower(names[len(names)-1]) - if len(names) > 2 && (ultimateName == "jr" || - ultimateName == "sr" || - ultimateName == "I" || - ultimateName == "II" || - ultimateName == "III") { - names = names[:len(names)-1] + + detailedProfiles = dedupeProfiles(detailedProfiles) + for i := range detailedProfiles { + if detailedProfiles[i].Media == nil { + detailedProfiles[i].Media = []profileMedia{} + } + if detailedProfiles[i].Information == nil { + detailedProfiles[i].Information = []profileInfoBlock{} + } + if detailedProfiles[i].Areas == nil { + detailedProfiles[i].Areas = []profileAreaBlock{} + } + for j := range detailedProfiles[i].Media { + if detailedProfiles[i].Media[j].Manipulations == nil { + detailedProfiles[i].Media[j].Manipulations = []any{} + } + if detailedProfiles[i].Media[j].CustomProperties == nil { + detailedProfiles[i].Media[j].CustomProperties = []any{} + } + } + } + + if err := os.MkdirAll(outDir, 0777); err != nil { + log.Printf("Failed to create output directory: %v", err) + return } - return names[0], names[len(names)-1] -} -func getNodeText(node *cdp.Node) string { - if len(node.Children) == 0 { - return "" + outPath := filepath.Join(outDir, profilesRawFileName) + detailOutput := profileDetailsResponse{Count: len(detailedProfiles), Profile: detailedProfiles} + if err := writePrettyJSON(outPath, detailOutput); err != nil { + log.Printf("Failed to write profile detail output file: %v", err) + return } - return node.Children[0].NodeValue + + log.Printf("Wrote %d raw profiles to %s", len(detailedProfiles), outPath) } -func scrapeProfessorLinks(chromedpCtx context.Context) []string { - var 
pageLinks []*cdp.Node - _, err := chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(BASE_URL+"1"), - chromedp.QueryAfter(".page-link", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - pageLinks = nodes - return nil - }, - ), - ) +func fetchProfileIndex(client *http.Client) (*profileIndexResponse, error) { + req, err := http.NewRequest(http.MethodGet, profileBaseURL, nil) if err != nil { - panic(err) + return nil, err } - numPages, err := strconv.Atoi(getNodeText(pageLinks[len(pageLinks)-2])) + resp, err := client.Do(req) if err != nil { - panic(err) + return nil, err } + defer resp.Body.Close() - professorLinks := make([]string, 0, numPages) - for curPage := 1; curPage <= numPages; curPage++ { - _, err := chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(BASE_URL+strconv.Itoa(curPage)), - chromedp.QueryAfter("//h5[@class='card-title profile-name']//a", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - href, hasHref := node.Attribute("href") - if !hasHref { - return errors.New("professor card was missing an href") - } - professorLinks = append(professorLinks, href) - } - return nil - }, - ), - ) - if err != nil { - panic(err) - } + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) } - return professorLinks -} - -// ScrapeProfiles navigates UTD profile listings and writes professor metadata to JSON. 
-func ScrapeProfiles(outDir string) { + var decoded profileIndexResponse + if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil { + return nil, err + } - chromedpCtx, cancel := utils.InitChromeDp() - defer cancel() + return &decoded, nil +} - err := os.MkdirAll(outDir, 0777) +func writePrettyJSON(path string, data any) error { + fptr, err := os.Create(path) if err != nil { - panic(err) + return err } + defer fptr.Close() - var professors []schema.Professor + encoder := json.NewEncoder(fptr) + encoder.SetEscapeHTML(false) + encoder.SetIndent("", " ") + if err := encoder.Encode(data); err != nil { + return err + } - log.Print("Scraping professor links...") - professorLinks := scrapeProfessorLinks(chromedpCtx) - log.Print("Scraped professor links!") + return nil +} - for _, link := range professorLinks { +func fetchProfileDetails(client *http.Client, slugs []string) ([]profileRawRecord, error) { + if len(slugs) == 0 { + return []profileRawRecord{}, nil + } - // Navigate to the link and get the names - var firstName, lastName string + requestURL := buildProfileDetailsRequestURL(slugs) + req, err := http.NewRequest(http.MethodGet, requestURL, nil) + if err != nil { + return nil, err + } - utils.VPrint("Scraping name...") + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() - _, err := chromedp.RunResponse(chromedpCtx, - chromedp.Navigate(link), - chromedp.ActionFunc(func(ctx context.Context) error { - var text string - err := chromedp.Text("div.contact_info>h1", &text).Do(ctx) - firstName, lastName = parseName(text) - return err - }), - ) - if err != nil { - panic(err) - } + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) + } - // Get the image uri - var imageUri string - - utils.VPrint("Scraping imageUri...") - - err = chromedp.Run(chromedpCtx, - chromedp.ActionFunc(func(ctx context.Context) error { - var attributes map[string]string - err := 
chromedp.Attributes("//img[@class='profile_photo']", &attributes, chromedp.AtLeast(0)).Do(ctx) - if err == nil { - var hasSrc bool - imageUri, hasSrc = attributes["src"] - if !hasSrc { - return errors.New("no src found for imageUri") - } - } - return err - }), - ) - if err != nil { - err = chromedp.Run(chromedpCtx, - chromedp.ActionFunc(func(ctx context.Context) error { - var attributes map[string]string - err := chromedp.Attributes("//div[@class='profile-header fancy_header ']", &attributes, chromedp.AtLeast(0)).Do(ctx) - if err == nil { - var hasStyle bool - imageUri, hasStyle = attributes["style"] - if !hasStyle { - return errors.New("no style found for imageUri") - } - imageUri = imageUri[23 : len(imageUri)-3] - } - return err - }), - ) - if err != nil { - panic(err) - } - } + var decoded profileDetailsResponse + if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil { + return nil, err + } - // Get the titles - titles := make([]string, 0, 3) - - utils.VPrint("Scraping titles...") - - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter("div.profile-title", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes { - tempText := getNodeText(node) - if !strings.Contains(tempText, "$") { - titles = append(titles, tempText) - } - } - return nil - }, chromedp.AtLeast(0), - ), - ) - if err != nil { - continue - } + return decoded.Profile, nil +} - // Get the email - var email string - utils.VPrint("Scraping email...") +func dedupeStrings(values []string) []string { + if len(values) < 2 { + return values + } - err = chromedp.Run(chromedpCtx, - chromedp.Text("//a[contains(@id,'☄️')]", &email, chromedp.AtLeast(0)), - ) - if err != nil { + seen := make(map[string]struct{}, len(values)) + result := make([]string, 0, len(values)) + for _, value := range values { + if _, exists := seen[value]; exists { continue } + seen[value] = struct{}{} + result = append(result, value) + } - // Get the phone number 
and office location - var texts []string - - utils.VPrint("Scraping list text...") - - err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter("div.contact_info>div ~ div", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - var tempText string - err := chromedp.Text("div.contact_info>div ~ div", &tempText).Do(ctx) - texts = strings.Split(tempText, "\n") - return err - }, - ), - ) - if err != nil { - panic(err) - } + return result +} - utils.VPrint("Parsing list...") - phoneNumber, office := parseList(texts) - utils.VPrintf("Parsed list! #: %s, Office: %v", phoneNumber, office) - - professors = append(professors, schema.Professor{ - Id: primitive.NewObjectID(), - First_name: firstName, - Last_name: lastName, - Titles: titles, - Email: email, - Phone_number: phoneNumber, - Office: office, - Profile_uri: link, - Image_uri: imageUri, - Office_hours: []schema.Meeting{}, - Sections: []primitive.ObjectID{}, - }) - - utils.VPrintf("Scraped profile for %s %s!", firstName, lastName) +func buildProfileDetailsRequestURL(slugs []string) string { + params := url.Values{} + params.Set("person", strings.Join(slugs, ";")) + params.Set("with_data", "1") + params.Set("data_type", "information;areas") + return fmt.Sprintf("%s?%s", profileBaseURL, params.Encode()) +} + +func dedupeProfiles(values []profileRawRecord) []profileRawRecord { + if len(values) < 2 { + return values } - // Write professor data to output file - fptr, err := os.Create(fmt.Sprintf("%s/profiles.json", outDir)) - if err != nil { - panic(err) + seen := make(map[string]struct{}, len(values)) + result := make([]profileRawRecord, 0, len(values)) + for _, value := range values { + key := strings.TrimSpace(strings.ToLower(value.Slug)) + if key == "" { + key = fmt.Sprintf("id:%d", value.ID) + } + if _, exists := seen[key]; exists { + continue + } + seen[key] = struct{}{} + result = append(result, value) } - encoder := json.NewEncoder(fptr) - encoder.SetIndent("", "\t") - 
encoder.Encode(professors) - fptr.Close() + + return result } diff --git a/scrapers/profiles_test.go b/scrapers/profiles_test.go new file mode 100644 index 0000000..c3fe1dd --- /dev/null +++ b/scrapers/profiles_test.go @@ -0,0 +1,45 @@ +package scrapers + +import ( + "net/url" + "testing" +) + +func TestBuildProfileDetailsRequestURL(t *testing.T) { + t.Parallel() + + raw := buildProfileDetailsRequestURL([]string{"herve.abdi", "nimali.abeykoon"}) + u, err := url.Parse(raw) + if err != nil { + t.Fatalf("failed to parse URL: %v", err) + } + q := u.Query() + + if q.Get("person") != "herve.abdi;nimali.abeykoon" { + t.Fatalf("unexpected person query value: %q", q.Get("person")) + } + if q.Get("with_data") != "1" { + t.Fatalf("unexpected with_data query value: %q", q.Get("with_data")) + } + if q.Get("data_type") != "information;areas" { + t.Fatalf("unexpected data_type query value: %q", q.Get("data_type")) + } +} + + +func TestDedupeProfiles(t *testing.T) { + t.Parallel() + + items := []profileRawRecord{ + {ID: 1, Slug: "alice.example"}, + {ID: 2, Slug: "ALICE.EXAMPLE"}, + {ID: 3, Slug: ""}, + {ID: 3, Slug: ""}, + {ID: 4, Slug: "bob.example"}, + } + + got := dedupeProfiles(items) + if len(got) != 3 { + t.Fatalf("expected 3 unique profiles, got %d", len(got)) + } +}