Skip to content

Commit

Permalink
Remove archive directory if empty
Browse files Browse the repository at this point in the history
  • Loading branch information
StJudeWasHere committed Nov 13, 2024
1 parent 3cae118 commit f426a13
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 23 deletions.
7 changes: 7 additions & 0 deletions internal/archiver/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ func NewReader(waczPath string) *Reader {
}
}

// ReadArchive reads the archive and returns the contents of the warc record for
// the specified URL as a string.
func (s *Reader) ReadArchive(urlStr string) (content string) {
wacz, err := zip.OpenReader(s.waczPath)
if err != nil {
Expand Down Expand Up @@ -73,6 +75,8 @@ func (s *Reader) ReadArchive(urlStr string) (content string) {
return string(c)
}

// getCDXEntry looks for the specified URL in the index file and returns an IndexEntry if found,
// otherwise it returns an error.
func (s *Reader) getCDXEntry(wacz *zip.ReadCloser, urlStr string) (*IndexEntry, error) {
file, err := s.getZipFile(wacz, "indexes/index.cdx")
if err != nil {
Expand Down Expand Up @@ -118,6 +122,7 @@ func (s *Reader) getCDXEntry(wacz *zip.ReadCloser, urlStr string) (*IndexEntry,
return &record, nil
}

// getZipFile returns a *zip.File from a wacz file. If not found it returns an error.
func (s *Reader) getZipFile(wacz *zip.ReadCloser, waczFile string) (*zip.File, error) {
for _, file := range wacz.File {
if file.Name == waczFile {
Expand All @@ -128,6 +133,8 @@ func (s *Reader) getZipFile(wacz *zip.ReadCloser, waczFile string) (*zip.File, e
return nil, errors.New("warc file file not found")
}

// searchFileSegment searches for the target string in the WACZ file index using binary search.
// It loads the index contents in memory.
func (s *Reader) searchFileSegment(offset, length int64, target string) (string, error) {
file, err := os.Open(s.waczPath)
if err != nil {
Expand Down
1 change: 0 additions & 1 deletion internal/archiver/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ func (s *Writer) createIndex() error {

// createPages creates the pages.jsonl file and adds the page records to it.
func (s *Writer) createPages() error {
// Add the pages.jsonl and add the file header
pagesWriter, err := s.waczWriter.Create("pages/pages.jsonl")
if err != nil {
return err
Expand Down
6 changes: 6 additions & 0 deletions internal/models/archive_record.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package models

// ArchiveRecord holds an archived response split into its two parts, each
// stored as a plain string.
type ArchiveRecord struct {
// Headers is the header section of the archived record.
Headers string
// Body is the body section of the archived record. It may be empty.
Body string
}
22 changes: 5 additions & 17 deletions internal/routes/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ func (h *resourceHandler) indexHandler(w http.ResponseWriter, r *http.Request) {
h.Renderer.RenderTemplate(w, "resources", pageView)
}

// archiveHandler handles the HTTP request for the archive page. It loads the data from the
// archive and displays the source code of the crawler's response for a specific resource.
func (h *resourceHandler) archiveHandler(w http.ResponseWriter, r *http.Request) {
user, ok := h.CookieSession.GetUser(r.Context())
if !ok {
Expand Down Expand Up @@ -142,36 +144,22 @@ func (h *resourceHandler) archiveHandler(w http.ResponseWriter, r *http.Request)
return
}

archive := h.Container.ArchiveService.ReadArchive(&pv.Project, pageReportView.PageReport.URL)

var headers, body string
index := strings.Index(archive, "\r\n\r\n")
if index != -1 {
// Split the string into two parts: before and after the first newline
headers = archive[:index]
body = strings.TrimSpace(archive[index+1:])
} else {
// If there's no newline, the entire text is the first part
headers = archive
body = ""
}
record := h.Container.ArchiveService.ReadArchiveRecord(&pv.Project, pageReportView.PageReport.URL)

data := &struct {
PageReportView *models.PageReportView
ProjectView *models.ProjectView
Eid string
Ep string
Tab string
Headers string
Body string
ArchiveRecord *models.ArchiveRecord
}{
ProjectView: pv,
PageReportView: pageReportView,
Eid: eid,
Ep: ep,
Tab: tab,
Headers: headers,
Body: body,
ArchiveRecord: record,
}

pageView := &PageView{
Expand Down
36 changes: 34 additions & 2 deletions internal/services/archiver.go → internal/services/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ package services

import (
"errors"
"log"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/stjudewashere/seonaut/internal/archiver"
"github.com/stjudewashere/seonaut/internal/models"
Expand All @@ -26,11 +29,22 @@ func (s *ArchiveService) GetArchiveWriter(p *models.Project) (*archiver.Writer,
}

// ReadArchiveRecord reads a URL's WACZ record from a project's archive and
// returns it as an ArchiveRecord. The record content is split at the first
// blank line ("\r\n\r\n"): the text before it becomes Headers and the text
// after it, with surrounding whitespace trimmed, becomes Body. If no blank
// line is found, the whole content is returned as Headers and Body is empty.
func (s *ArchiveService) ReadArchive(p *models.Project, urlStr string) string {
func (s *ArchiveService) ReadArchiveRecord(p *models.Project, urlStr string) *models.ArchiveRecord {
waczPath := s.getArchiveFile(p)
reader := archiver.NewReader(waczPath)

return reader.ReadArchive(urlStr)
content := reader.ReadArchive(urlStr)

record := &models.ArchiveRecord{}
// Locate the blank line that separates the headers from the body.
index := strings.Index(content, "\r\n\r\n")
if index != -1 {
record.Headers = content[:index]
// index+1 leaves part of the separator at the front of the slice;
// TrimSpace removes it along with any other surrounding whitespace.
record.Body = strings.TrimSpace(content[index+1:])
} else {
// No separator found: treat the whole content as headers.
record.Headers = content
}

return record
}

// ArchiveExists checks if a wacz file exists for the current project.
Expand All @@ -50,6 +64,24 @@ func (s *ArchiveService) DeleteArchive(p *models.Project) {

file := s.getArchiveFile(p)
os.Remove(file)

// Check if the archive directory is empty and remove it.
dir := filepath.Dir(file)
d, err := os.Open(dir)
if err != nil {
log.Printf("failed to open archive dir %s: %v", dir, err)
return
}

_, err = d.ReadDir(1)
if err == nil {
return // dir is not empty.
}

err = os.Remove(dir)
if err != nil {
log.Printf("failed to remove empty archive dir %s: %v", dir, err)
}
}

// GetArchiveFilePath returns the project's wacz file path if it exists,
Expand Down
6 changes: 3 additions & 3 deletions web/templates/archive.html
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@
<div class="content">
<h2>Headers</h2>
<p>This block shows the response headers as received by the crawler.</p>
<pre class="archive"><code>{{ .Headers }}</code></pre>
<pre class="archive"><code>{{ .ArchiveRecord.Headers }}</code></pre>

{{ if .Body }}
{{ if .ArchiveRecord.Body }}
<h2>Body</h2>
<p>This block shows the response body as received by the crawler.</p>
<pre class="archive"><code>{{ .Body }}</code></pre>
<pre class="archive"><code>{{ .ArchiveRecord.Body }}</code></pre>
{{ end }}
</div>
</div>
Expand Down

0 comments on commit f426a13

Please sign in to comment.