Skip to content

Commit 4efbdb7

Browse files
committed
Initial Checkin of search code
This PR has two related features 1. A script in `dev_scripts` to parse all of the sample problems and the POD to build up a "database" of non-common words for a given problem/POD page. 2. Add search boxes to the sample-problems/POD pages to return pages that match the search criteria.
1 parent 252a36a commit 4efbdb7

File tree

9 files changed

+1686
-1
lines changed

9 files changed

+1686
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ htdocs/themes/*
3636
DATA/*
3737
!DATA/uploads
3838
DATA/uploads/*
39+
!htdocs/DATA/search.json
3940
!*README*
4041

4142
docker-compose.yml

bin/dev_scripts/build-search-db.pl

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
#!/usr/bin/env perl
2+
3+
=head1 NAME
4+
5+
build-search-db.pl - Build a search file for the sample problems and POD files.
6+
7+
=head1 SYNOPSIS
8+
9+
build-search-db.pl [options]
10+
11+
Options:
12+
-p|--pg-root Directory containing a git clone of pg.
13+
If this option is not set, then the environment
14+
variable $PG_ROOT will be used if it is set.
15+
-f|--json-file Location (relative to WW_ROOT) to store the resulting JSON file.
16+
Default value is htdocs/DATA/search.json
17+
-s|--sample-prob-dir Location (relative to $PG_ROOT) where the sample problems are located.
18+
Default value is tutorial/sample-problems
19+
-b|--build One of (all, macros, samples) to determine if the macros, sample
20+
problems or both should be scraped for data.
21+
-v|--verbose Setting this flag provides details as the script runs.
22+
23+
Note that --pg-root must be provided or the PG_ROOT environment variable set
24+
if the POD for pg is desired.
25+
26+
=head1 DESCRIPTION
27+
28+
Read through all of the files in $PG_ROOT/tutorial/sample-problems and the POD in the macro files.
29+
The result is a JSON file containing information about every file to be searched for in the sample-problems
30+
space.
31+
32+
=cut
33+
34+
use strict;
35+
use warnings;
36+
37+
use feature "say";
38+
39+
use Getopt::Long qw(:config bundling);
40+
use File::Find;
41+
use Mojo::JSON qw(encode_json);
42+
use Mojo::File qw(path curfile);
43+
use Pod::Simple::SimpleTree;
44+
45+
# Option defaults.  The pg root falls back to the PG_ROOT environment variable
# when --pg-root is not given on the command line.
my $build   = "all";
my $pg_root = $ENV{PG_ROOT};
# These are the default sample problem directory and JSON file.
my $dir       = "tutorial/sample-problems";
my $json_file = "htdocs/DATA/search.json";
my $verbose   = 0;

# GetOptions returns false when the command line could not be parsed (it has
# already printed the specific complaint), so stop rather than run with
# half-applied options.
GetOptions(
    'p|pg-root=s'         => \$pg_root,
    'f|json-file=s'       => \$json_file,
    's|sample-prob-dir=s' => \$dir,
    'b|build=s'           => \$build,
    'v|verbose+'          => \$verbose
) or die "Invalid command line options.  See the POD at the top of this script for usage.\n";

die "The build options must be one of (all, macros, samples). The value $build is not valid."
    if ((grep { $_ eq $build } qw/all macros samples/) == 0);

# Both the sample problem directory and the macro directory are built from
# the pg root, so it is required for every build mode.
die "The pg root must be given with --pg-root or the PG_ROOT environment variable.\n"
    unless defined($pg_root) && $pg_root ne '';
die "pg_root: $pg_root is not a directory" unless -d $pg_root;

# The webwork root defaults to two directories above this script.
my $ww_root = $ENV{WW_ROOT};
$ww_root = Mojo::File->new(curfile->dirname, "..", "..")->realpath unless defined($ww_root);

die "ww_root: $ww_root is not a directory" unless -d $ww_root;

$dir       = "$pg_root/$dir";
$json_file = path("$ww_root/$json_file");

# Make sure the directory that will hold the JSON file exists.
my $json_dir = $json_file->dirname;
$json_dir->make_path unless -d $json_dir;

if ($verbose) {
    say "Running script build-search-data with the following options:";
    say " pg-root: $pg_root";
    say " ww-root: $ww_root";
    say " build: $build";
    say " dir: $dir";
    say " json_file: $json_file";
}

# Load the stop words (one per line).  These are the words that processLine
# leaves out of the search terms.  A missing file is not fatal: the script
# warns and continues with an empty list, so no words are filtered.
my @stop_words;
my $stop_words_file = "$ww_root/bin/dev_scripts/stop-words-en.txt";
if (open(my $FH, '<:encoding(UTF-8)', $stop_words_file)) {
    # Trim surrounding whitespace (including line endings) and drop blank lines.
    @stop_words = grep { length } map {s/^\s+|\s+$//gr} <$FH>;
    close $FH;
} else {
    warn qq{Could not open file "$stop_words_file": $!};
}

# Store the search info for each file as an array of hashrefs.
my @search_terms;

my $index = 1;    # A running id that is assigned to each file.
97+
98+
# File::Find "wanted" callback that scrapes one sample problem.
#
# File::Find calls this with $_ set to the name of the current file and
# $File::Find::name / $File::Find::dir set to its path and directory.  Files
# that do not end in .pg are ignored.  For a .pg file the following is
# collected and appended to @search_terms as a single hashref:
#   terms       - non stop words from the "#:" documentation lines
#   name        - value of the "#:% name = ..." line
#   subjects    - list from the "#:% subject = [...]" line
#   macros      - macros passed to loadMacros, other than the common ones
#   keywords    - contents of the "## KEYWORDS(...)" line
#   description - text between "## DESCRIPTION" and "## ENDDESCRIPTION"
# Nothing useful is returned; a file that cannot be opened is skipped with a
# warning.
sub processFile {
    return unless $_ =~ /\.pg$/;
    say "Processing $_" if $verbose;

    my $filename = $_;

    open(my $FH, '<:encoding(UTF-8)', $File::Find::name) or do {
        warn qq{Could not open file "$File::Find::name": $!};
        return;
    };
    my @file_contents = <$FH>;
    close $FH;

    # Strip the line endings once so that the lines that are consumed inside
    # the branches below (loadMacros continuation lines and description
    # lines) are treated the same as the line that started the branch.
    chomp(@file_contents);

    my (@words, @kw, @macros, @subjects, $name, $description);

    # Test for definedness so that blank lines and a "0" line do not end the loop.
    while (defined(my $line = shift @file_contents)) {
        if ($line =~ /^#:[^%]/) {
            # A documentation line within the sample problem.
            $line =~ s/^#:\s*//;
            push(@words, processLine($line));
        } elsif ($line =~ /^#:%\s*(\w+)\s*=\s*(.*?)\s*$/) {
            # A metadata line.  Only the name and the subjects are stored.
            my ($key, $value) = ($1, $2);
            if ($key eq 'name') {
                $name = $value;
            } elsif ($key eq 'subject') {
                @subjects = split(/,\s*/, $value =~ s/\[(.*)\]/$1/r);
            }
        } elsif ($line =~ /^loadMacros/) {
            # The macro list may span several lines.  Join lines until the
            # closing ");" is seen or the file runs out.
            my $macros = $line;
            while ($line !~ /\);\s*$/ && @file_contents) {
                $line = shift @file_contents;
                $macros .= $line;
            }
            my @all_macros =
                grep { length }
                map  {s/['"\s]//gr} split(/\s*,\s*/, $macros =~ s/loadMacros\((.*)\)\;\s*$/$1/r);

            # Only store macros that are not common.  This assigns to the
            # @macros declared above so that the result reaches the output.
            @macros = grep { !/(PGML|PGstandard|PGcourse)/ } @all_macros;
        } elsif ($line =~ /##\s*KEYWORDS\((.*)\)/) {
            @kw = map {s/^'(.*)'$/$1/r} split(/,\s*/, $1);
        } elsif ($line =~ /^##\s*DESCRIPTION/) {
            # Collect lines up to ENDDESCRIPTION.  The loop also ends at the
            # end of the file so that a missing ENDDESCRIPTION can not cause
            # an endless loop.
            my @description_lines;
            while (defined(my $description_line = shift @file_contents)) {
                last if $description_line =~ /ENDDESCRIPTION/;
                push(@description_lines, $description_line =~ s/^##\s*//r);
            }
            if (@description_lines) {
                $description = join(' ', @description_lines);
                $description =~ s/\s+$//;
            }
        }
    }

    push(
        @search_terms,
        {
            filename    => $filename,
            type        => 'sample problem',
            name        => $name,
            subjects    => \@subjects,
            terms       => \@words,
            keywords    => \@kw,
            macros      => \@macros,
            description => $description,
            dir         => Mojo::File->new($File::Find::dir)->basename,
            id          => $index++
        }
    );
    return;
}
167+
168+
# Split a line of text into lower case search words.
#
# Parameters:
#   $line - the text to split on whitespace.
# Returns the list of cleaned words, in order.  A word is left out if it is
# empty after cleaning, if it contains a bracket, digit or equals sign, or if
# it is one of the entries of the file level @stop_words list.
sub processLine {
    my ($line) = @_;
    return unless defined $line;

    my @words = ();

    # split ' ' also discards leading whitespace, so no empty first field.
    for my $word (split(' ', $line)) {

        # The following lines pull out some formatting.
        $word =~ s/(PODLINK|PROBLINK)\('([\w.]+)'\)/$2/;
        $word =~ s/`(.*)`/$1/;
        $word =~ s/[.!,]$//;
        $word =~ s/[()\*\\\+\{\}]//g;
        $word = lc($word);
        next if $word eq '' || $word =~ /\[|\]|\d|=/;

        # Compare as plain strings.  The word can still contain characters
        # such as "?", "|" or "^", which must not be treated as a pattern.
        next if grep { $_ eq $word } @stop_words;

        push(@words, $word);
    }
    return @words;
}
189+
190+
# Concatenate all of the text contained in a node of a parsed POD tree.
# A node is either a plain string or an array ref of the form
# [ element name, attribute hashref, children... ], where each child is again
# a node.  Nested formatting codes are descended into.
sub _podNodeText {
    my ($node) = @_;
    return '' unless defined $node;
    return $node unless ref($node) eq 'ARRAY';
    return join('', map { _podNodeText($_) } @$node[ 2 .. $#$node ]);
}

# Extract the text of the block that directly follows a section heading.
#
# Parameters:
#   $root  - root of the (preparsed) POD tree, an array ref whose children
#            start at index 2.
#   $title - the heading text to look for (compared as a plain string after
#            trimming surrounding whitespace).
# Returns the text of the node after the heading, or the empty string if the
# heading is immediately followed by another heading or by nothing.  If the
# heading is not found, or is found more than once, a warning is issued and
# nothing (undef in scalar context) is returned.
sub extractPODNode {
    my ($root, $title) = @_;

    # Only heading elements can start a section.  Looking at other elements
    # would let a paragraph that mentions the title count as a second match.
    my $is_heading = sub {
        my ($node) = @_;
        return ref($node) eq 'ARRAY' && defined($node->[0]) && $node->[0] =~ /^head\d+$/;
    };

    my @index = grep {
        $is_heading->($root->[$_])
            && _podNodeText($root->[$_]) =~ s/^\s+|\s+$//gr eq $title
    } 2 .. $#$root;

    if (@index == 0) {
        warn "The section named $title is not found in the POD.";
        return;
    }
    if (@index > 1) {
        warn "There were more than one section named $title in the POD.";
        return;
    }

    # The section text is the node that follows the heading.  If that node is
    # itself a heading (or does not exist) then the section is empty.
    my $node = $root->[ $index[0] + 1 ];
    return '' if !defined($node) || $is_heading->($node);

    return _podNodeText($node);
}
213+
214+
# Build the search entry for the POD of a single file.
#
# Parameters:
#   $filename - the file to parse (it is stringified before parsing).
# Returns a hashref with the keys
#   type        - always "macro"
#   name        - text of the NAME section, or '' if that lookup fails
#   description - array ref of the search words from the DESCRIPTION section
sub processPODFile {
    my ($filename) = @_;

    my $pod_tree = Pod::Simple::SimpleTree->new();
    my $root     = $pod_tree->parse_file("$filename")->root;

    # Look up NAME before DESCRIPTION so that any warnings keep their order.
    my $name_text        = extractPODNode($root, "NAME");
    my $description_text = extractPODNode($root, "DESCRIPTION");

    my %entry = (
        type        => "macro",
        name        => $name_text // '',
        description => [ processLine($description_text // '') ],
    );

    return \%entry;
}
225+
226+
# Scrape the sample problems below $dir if they were requested.
if ($build eq 'all' || $build eq 'samples') {
    find({ wanted => \&processFile }, "$dir");
}

# Scrape the POD of every file listed in the macros/math directory if the
# macros were requested.
if ($build eq 'all' || $build eq 'macros') {
    my $macro_dir = Mojo::File->new("$pg_root/macros/math");

    for my $macro_file ($macro_dir->list->each) {
        say "processing " . $macro_file->basename if $verbose;

        my $pod_file = processPODFile($macro_file);
        $pod_file->{filename} = $macro_file->basename;
        $pod_file->{id}       = $index++;
        $pod_file->{dir}      = $macro_file->dirname->to_rel("$pg_root")->to_string;

        push(@search_terms, $pod_file);
    }
}

# Encode everything that was collected and write it to the JSON file.
my $encoded_terms = encode_json(\@search_terms);

say "Writing document info to $json_file" if $verbose;
$json_file->spew($encoded_terms);
248+

0 commit comments

Comments
 (0)