Skip to content

Commit d2922fe

Browse files
committed
don’t keep query results in memory, write them directly to disk
For queries that fit in memory, the page cache will give us the necessary performance. Queries that don’t fit in memory are enabled by this commit :).
1 parent fdae522 commit d2922fe

File tree

2 files changed

+183
-50
lines changed

2 files changed

+183
-50
lines changed

cmd/dcs-web/querymanager.go

Lines changed: 144 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ import (
88
"github.com/Debian/dcs/cmd/dcs-web/common"
99
"github.com/Debian/dcs/cmd/dcs-web/search"
1010
dcsregexp "github.com/Debian/dcs/regexp"
11+
"github.com/Debian/dcs/stringpool"
1112
"github.com/influxdb/influxdb-go"
13+
"hash/fnv"
1214
"io"
1315
"log"
1416
"math"
@@ -115,6 +117,38 @@ func (s ByRanking) Swap(i, j int) {
115117
s[i], s[j] = s[j], s[i]
116118
}
117119

120+
// resultPointer locates one JSON-serialized search result inside the
// per-backend temp file it was written to, together with the metadata
// needed to sort results without re-reading them from disk.
type resultPointer struct {
	backendidx int
	ranking    float32
	offset     int64
	length     int64

	// Used as a tie-breaker when sorting by ranking to guarantee stable
	// results, independent of the order in which the results are returned
	// from source backends.
	pathHash uint64

	// Used for per-package results. Points into a stringpool.StringPool
	packageName *string
}

// pointerByRanking implements sort.Interface over []resultPointer:
// descending ranking first, with descending pathHash as the tie-breaker.
type pointerByRanking []resultPointer

func (s pointerByRanking) Len() int { return len(s) }

func (s pointerByRanking) Less(i, j int) bool {
	a, b := s[i], s[j]
	if a.ranking != b.ranking {
		return a.ranking > b.ranking
	}
	// Equal rankings: fall back to the path hash for a deterministic order.
	return a.pathHash > b.pathHash
}

func (s pointerByRanking) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
151+
118152
type queryState struct {
119153
started time.Time
120154
events []event
@@ -132,9 +166,12 @@ type queryState struct {
132166
resultPages int
133167
numResults int
134168

135-
// TODO: this will be deleted once we write everything directly to disk
136-
allResults []Result
137-
allResultsMu *sync.Mutex
169+
// One file per backend, containing JSON-serialized results. When writing,
170+
// we keep the offsets, so that we can later sort the pointers and write
171+
// the resulting files.
172+
tempFiles []*os.File
173+
packagePool *stringpool.StringPool
174+
resultPointers []resultPointer
138175

139176
allPackages map[string]bool
140177
allPackagesSorted []string
@@ -198,7 +235,7 @@ func queryBackend(queryid string, backend string, backendidx int, query string)
198235
}
199236
}
200237
if r.Type == "result" {
201-
storeResult(queryid, r)
238+
storeResult(queryid, backendidx, r)
202239
} else if r.Type == "progress" {
203240
storeProgress(queryid, backendidx, r)
204241
}
@@ -214,7 +251,7 @@ func maybeStartQuery(queryid, src, query string) bool {
214251
// XXX: Starting a new query while there may still be clients reading that
215252
// query is not a great idea. Best fix may be to make getEvent() use a
216253
// querystate instead of the string identifier.
217-
if !running || time.Since(querystate.started) > 15*time.Minute {
254+
if !running || time.Since(querystate.started) > 30*time.Minute {
218255
backends := strings.Split(*common.SourceBackends, ",")
219256
state[queryid] = queryState{
220257
started: time.Now(),
@@ -224,13 +261,32 @@ func maybeStartQuery(queryid, src, query string) bool {
224261
filesTotal: make([]int, len(backends)),
225262
filesProcessed: make([]int, len(backends)),
226263
filesMu: &sync.Mutex{},
227-
allResults: make([]Result, 0),
228-
allResultsMu: &sync.Mutex{},
264+
tempFiles: make([]*os.File, len(backends)),
229265
allPackages: make(map[string]bool),
230266
allPackagesMu: &sync.Mutex{},
267+
packagePool: stringpool.NewStringPool(),
268+
}
269+
270+
var err error
271+
dir := filepath.Join(*queryResultsPath, queryid)
272+
if err := os.MkdirAll(dir, os.FileMode(0755)); err != nil {
273+
// TODO: mark the query as failed
274+
log.Printf("[%s] could not create %q: %v\n", queryid, dir, err)
275+
return false
231276
}
277+
278+
// TODO: it’d be so much better if we would correctly handle ESPACE errors
279+
// in the code below (and above), but for that we need to carefully test it.
280+
ensureEnoughSpaceAvailable()
281+
232282
for i := 0; i < len(backends); i++ {
233283
state[queryid].filesTotal[i] = -1
284+
path := filepath.Join(dir, fmt.Sprintf("unsorted_%d.json", i))
285+
state[queryid].tempFiles[i], err = os.Create(path)
286+
if err != nil {
287+
log.Printf("[%s] could not create %q: %v\n", queryid, path, err)
288+
// TODO: mark query as failed
289+
}
234290
}
235291
log.Printf("initial results = %v\n", state[queryid])
236292
for idx, backend := range backends {
@@ -260,7 +316,7 @@ func sendPaginationUpdate(queryid string, s queryState) {
260316
}
261317
}
262318

263-
func storeResult(queryid string, result Result) {
319+
func storeResult(queryid string, backendidx int, result Result) {
264320
result.Type = "result"
265321

266322
result.Package = result.Path[:strings.Index(result.Path, "_")]
@@ -269,8 +325,6 @@ func storeResult(queryid string, result Result) {
269325
// for the top 10 at all.
270326
s := state[queryid]
271327

272-
log.Printf("[%s] (currently %d) result %v\n", queryid, len(s.allResults), result)
273-
274328
if s.FirstPathRank > 0 {
275329
// Now store the combined ranking of PathRanking (pre) and Ranking (post).
276330
// We add the values because they are both percentages.
@@ -302,24 +356,51 @@ func storeResult(queryid string, result Result) {
302356
addEventMarshal(queryid, &result)
303357
}
304358

305-
// TODO: as a first POC, keep all results in memory, sort them, write them out to files.
359+
tmpOffset, err := state[queryid].tempFiles[backendidx].Seek(0, os.SEEK_CUR)
360+
if err != nil {
361+
log.Printf("[%s] could not seek: %v\n", queryid, err)
362+
// TODO: mark query as failed
363+
return
364+
}
365+
366+
if err := json.NewEncoder(s.tempFiles[backendidx]).Encode(result); err != nil {
367+
log.Printf("[%s] could not write %v: %v\n", queryid, result, err)
368+
// TODO: mark query as failed
369+
}
370+
371+
offsetAfterWriting, err := state[queryid].tempFiles[backendidx].Seek(0, os.SEEK_CUR)
372+
if err != nil {
373+
log.Printf("[%s] could not seek: %v\n", queryid, err)
374+
// TODO: mark query as failed
375+
return
376+
}
377+
378+
h := fnv.New64()
379+
io.WriteString(h, result.Path)
380+
306381
stateMu.Lock()
307382
s = state[queryid]
308-
s.allResults = append(s.allResults, result)
383+
s.resultPointers = append(s.resultPointers, resultPointer{
384+
backendidx: backendidx,
385+
ranking: result.Ranking,
386+
offset: tmpOffset,
387+
length: offsetAfterWriting - tmpOffset,
388+
pathHash: h.Sum64(),
389+
packageName: s.packagePool.Get(result.Package)})
309390
s.allPackages[result.Package] = true
310391
s.numResults++
311392
state[queryid] = s
312393
stateMu.Unlock()
313-
314-
// TODO: write the result to disk, no matter what
315-
// TODO: eventually, we’ll want to write it to unsorted.json and sort it afterwards. we could do that by reading through the file, storing (ranking, file_offset) tuples, sorting them, then writing out the sorted files. note that we can even store the (ranking, file_offset) tuples at the time when the results come in.
316394
}
317395

318396
func finishQuery(queryid string) {
319397
log.Printf("[%s] done, closing all client channels.\n", queryid)
320398
stateMu.Lock()
321399
s := state[queryid]
322400
s.done = true
401+
for _, f := range s.tempFiles {
402+
f.Close()
403+
}
323404
state[queryid] = s
324405
stateMu.Unlock()
325406
addEvent(queryid, []byte{}, nil)
@@ -414,18 +495,48 @@ func ensureEnoughSpaceAvailable() {
414495
}
415496
}
416497

498+
// createFromPointers writes the JSON array file |name| (the callers use it
// for page_<n>.json and pkg_<package>.json) by concatenating the
// already-serialized results that |pointers| refer to. Each pointer
// identifies a byte range inside one of the per-backend temp files
// (queryState.tempFiles), so results are copied disk-to-disk without being
// decoded and re-encoded.
//
// The ranges were written with json.Encoder.Encode, i.e. each one ends in a
// newline; wrapping them in "[", "," and "]" therefore still produces a
// valid JSON array.
//
// NOTE(review): state[queryid] is read here without holding stateMu, and the
// shared temp files are seeked — presumably safe because this runs from
// writeToDisk after all backends have delivered their results, but worth
// confirming that nothing writes to state or the temp files concurrently.
func createFromPointers(queryid string, name string, pointers []resultPointer) error {
	log.Printf("[%s] writing %q\n", queryid, name)
	f, err := os.Create(name)
	if err != nil {
		return err
	}
	defer f.Close()
	// Opening bracket of the JSON array.
	if _, err := f.Write([]byte("[")); err != nil {
		return err
	}
	for idx, pointer := range pointers {
		// Position the source temp file at this result's serialized bytes.
		src := state[queryid].tempFiles[pointer.backendidx]
		if _, err := src.Seek(pointer.offset, os.SEEK_SET); err != nil {
			return err
		}
		// Comma-separate every element after the first.
		if idx > 0 {
			if _, err := f.Write([]byte(",")); err != nil {
				return err
			}
		}
		// Copy exactly the bytes of one serialized result, verbatim.
		if _, err := io.CopyN(f, src, pointer.length); err != nil {
			return err
		}
	}
	if _, err := f.Write([]byte("]\n")); err != nil {
		return err
	}
	return nil
}
527+
417528
func writeToDisk(queryid string) {
418529
// Get the slice with results and unset it on the state so that processing can continue.
419530
stateMu.Lock()
420531
s := state[queryid]
421-
results := s.allResults
422-
if len(results) == 0 {
532+
pointers := s.resultPointers
533+
if len(pointers) == 0 {
423534
log.Printf("[%s] not writing, no results.\n", queryid)
424535
stateMu.Unlock()
425536
finishQuery(queryid)
426537
return
427538
}
428-
s.allResults = make([]Result, 0)
539+
s.resultPointers = nil
429540
idx := 0
430541
packages := make([]string, len(s.allPackages))
431542
// TODO: sort by ranking as soon as we store the best ranking with each package. (at the moment it’s first result, first stored)
@@ -437,10 +548,10 @@ func writeToDisk(queryid string) {
437548
state[queryid] = s
438549
stateMu.Unlock()
439550

440-
log.Printf("[%s] writing, %d results.\n", queryid, len(results))
551+
log.Printf("[%s] writing, %d results.\n", queryid, len(pointers))
441552
log.Printf("[%s] packages: %v\n", queryid, packages)
442553

443-
sort.Sort(ByRanking(results))
554+
sort.Sort(pointerByRanking(pointers))
444555

445556
resultsPerPage := 10
446557
dir := filepath.Join(*queryResultsPath, queryid)
@@ -466,57 +577,40 @@ func writeToDisk(queryid string) {
466577
}
467578
f.Close()
468579

469-
pages := int(math.Ceil(float64(len(results)) / float64(resultsPerPage)))
580+
pages := int(math.Ceil(float64(len(pointers)) / float64(resultsPerPage)))
470581
for page := 0; page < pages; page++ {
471582
start := page * resultsPerPage
472583
end := (page + 1) * resultsPerPage
473-
if end > len(results) {
474-
end = len(results)
584+
if end > len(pointers) {
585+
end = len(pointers)
475586
}
587+
476588
name := filepath.Join(dir, fmt.Sprintf("page_%d.json", page))
477-
log.Printf("[%s] writing %q\n", queryid, name)
478-
f, err := os.Create(name)
479-
if err != nil {
480-
log.Printf("[%s] could not create %q: %v\n", queryid, f, err)
589+
if err := createFromPointers(queryid, name, pointers[start:end]); err != nil {
590+
log.Printf("[%s] could not create %q from pointers: %v\n", queryid, name, err)
481591
// TODO: mark query as failed
482592
return
483593
}
484-
encoder := json.NewEncoder(f)
485-
if err := encoder.Encode(results[start:end]); err != nil {
486-
log.Printf("[%s] could not write %v: %v\n", queryid, results[start:end], err)
487-
// TODO: mark query as failed
488-
return
489-
}
490-
// We don’t use defer f.Close() because that would only be executed once the function returns.
491-
f.Close()
492594
}
493595

494596
// Now save the results into their package-specific files.
495-
bypkg := make(map[string][]Result)
496-
for _, result := range results {
497-
pkgresults := bypkg[result.Package]
597+
bypkg := make(map[string][]resultPointer)
598+
for _, pointer := range pointers {
599+
pkgresults := bypkg[*pointer.packageName]
498600
if len(pkgresults) >= resultsPerPackage {
499601
continue
500602
}
501-
pkgresults = append(pkgresults, result)
502-
bypkg[result.Package] = pkgresults
603+
pkgresults = append(pkgresults, pointer)
604+
bypkg[*pointer.packageName] = pkgresults
503605
}
504606

505607
for pkg, pkgresults := range bypkg {
506608
name := filepath.Join(dir, fmt.Sprintf("pkg_%s.json", pkg))
507-
log.Printf("[%s] writing %q\n", queryid, name)
508-
f, err := os.Create(name)
509-
if err != nil {
510-
log.Printf("[%s] could not create %q: %v\n", queryid, f, err)
609+
if err := createFromPointers(queryid, name, pkgresults); err != nil {
610+
log.Printf("[%s] could not create %q from pointers: %v\n", queryid, name, err)
511611
// TODO: mark query as failed
512612
return
513613
}
514-
if err := json.NewEncoder(f).Encode(pkgresults); err != nil {
515-
log.Printf("[%s] could not write results: %v\n", queryid, err)
516-
// TODO: mark query as failed
517-
return
518-
}
519-
f.Close()
520614
}
521615

522616
stateMu.Lock()

stringpool/stringpool.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// stringpool provides a pool of string pointers, ensuring that each string is
2+
// stored only once in memory. This is useful for queries that have many
3+
// results, as the amount of source packages is limited. So, as soon as
4+
// len(results) > len(sourcepackages), you save memory using a stringpool.
5+
package stringpool
6+
7+
import (
8+
"sync"
9+
)
10+
11+
// StringPool interns strings: Get hands out one canonical *string per
// distinct string value, so repeated values share a single allocation.
// The embedded RWMutex makes the pool safe for concurrent use.
type StringPool struct {
	sync.RWMutex
	strings map[string]*string
}

// NewStringPool returns an empty pool, ready for use.
func NewStringPool() *StringPool {
	pool := &StringPool{}
	pool.strings = make(map[string]*string)
	return pool
}

// Get returns the canonical pointer for s, inserting s on first use.
func (pool *StringPool) Get(s string) *string {
	// Fast path: most calls hit an existing entry, so try the cheaper
	// read lock first.
	pool.RLock()
	interned, found := pool.strings[s]
	pool.RUnlock()
	if found {
		return interned
	}

	// Slow path: take the write lock and re-check, because another
	// goroutine may have inserted s between the two lock acquisitions.
	pool.Lock()
	defer pool.Unlock()
	if interned, found = pool.strings[s]; found {
		return interned
	}
	ptr := &s
	pool.strings[s] = ptr
	return ptr
}

0 commit comments

Comments
 (0)