{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards   #-} -- needed for the Conf{..} / Ctx{..} / SiteScraper{..} patterns below
module Main where

import Prelude hiding (log)

-- NB: some of the following may also be re-exported by HsBooru.Util; the
-- explicit base imports merely keep this file self-contained.
import Control.Applicative ((<|>), liftA2)
import Control.Concurrent
import Control.Monad (forM_, join)
import Data.Maybe (fromMaybe)
import Data.Monoid ((<>))
import Data.Text (Text)
import GHC.Conc (getNumProcessors)
import Options.Applicative
import System.Directory (createDirectoryIfMissing)
import System.FilePath

import qualified Data.Acid as A
import qualified Data.Text as T

import HsBooru.Scraper
import HsBooru.Sites
import HsBooru.Stats
import HsBooru.Types hiding (update)
import HsBooru.Util
import HsBooru.Xapian
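-- GlobalConf mirrors the command line interface: each field corresponds to
-- one of the global options parsed by parseGlobalOpts at the bottom of this
-- file.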
data GlobalConf = Conf
    { dbDir       :: FilePath
    , optImageDir :: Maybe FilePath
    , batchSize   :: Int
    , parCount    :: Int
    , optCapCount :: Maybe Int
    , retryCount  :: Int
    , minTagCount :: Int
    , blackList   :: [Text]
    , whiteList   :: [Text]
    , verbose     :: Bool
    }

type Command = GlobalConf -> IO ()
siteOpt :: Parser SiteScraper
siteOpt = argument readSite $ metavar "SITE" <> help "booru name"
    where readSite = maybeReader findSite <|> readerError msg
          msg = "Invalid site name. Currently supported sites: "
             ++ unwords (map siteName scrapers)

siteNameOpt :: Parser String
siteNameOpt = siteName <$> siteOpt
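-- Note: in readSite, the `maybeReader findSite` branch is tried first; the
-- readerError branch (listing the supported sites) only fires when the
-- lookup fails.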
runScraper :: [SiteScraper] -> GlobalConf -> InternalDB -> IO ()
runScraper sites Conf{..} acidDB = do
    -- Generate context
    let xapianDir = dbDir </> "xapian"
        imageDir  = Just $ fromMaybe (dbDir </> "images") optImageDir
    forM_ imageDir $ createDirectoryIfMissing True
    xapianDB <- either error id <$> localDB xapianDir

    -- Spawn enough threads that each capability keeps roughly parCount
    -- connections open simultaneously
    capCount <- case optCapCount of
        Nothing   -> min 4 <$> getNumProcessors
        Just jobs -> jobs <$ setNumCapabilities jobs
    let threadCount = capCount * parCount
    fetchURL <- fetchHTTP $ threadCount + 10

    -- Run scraper
    forM_ sites $ \site@SiteScraper{..} -> do
        res <- runBooruM Ctx{..} $ processSite site
        either (logError siteName . show) return res

    A.createArchive acidDB
    log "general" "Done scraping."
-- ** `scrape` command

scrape :: [SiteScraper] -> Command
scrape sites c@Conf{..} = withAcid dbDir $ runScraper sites c

scrapeCmd :: Mod CommandFields Command
scrapeCmd = command "scrape" . info (scrape <$> some siteOpt) $
    progDesc "Scrape posts from websites"
-- ** `update` command

update :: Command
update c@Conf{..} = withAcid dbDir $ \acidDB -> do
    sites <- A.query acidDB ActiveSites
    runScraper [ ss | ss <- scrapers, siteName ss `elem` sites ] c acidDB

updateCmd :: Mod CommandFields Command
updateCmd = command "update" . info (pure update) $
    progDesc "Update all previously scraped websites"
-- ** `retry` command

retrySite :: [String] -> Command
retrySite ss Conf{..} = withAcid dbDir $ \acidDB ->
    forM_ ss $ A.update acidDB . RetrySite

retryCmd :: Mod CommandFields Command
retryCmd = command "retry" . info (retrySite <$> some siteNameOpt) $
    progDesc "Reset the deleted post database for named sites"
-- ** `info` command

siteInfo :: String -> Command
siteInfo site Conf{..} = withAcid dbDir $ \acidDB -> do
    ss <- A.query acidDB (GetSite site)
    putStrLn $ "Stats for site `" ++ site ++ "`:\n"
    printSiteStats ss

infoCmd :: Mod CommandFields Command
infoCmd = command "info" . info (siteInfo <$> siteNameOpt) $
    progDesc "Show some statistics about a named site"
-- * Main

opts :: ParserInfo (IO ())
opts = info (liftA2 ($) parseCmd parseGlobalOpts <**> helper) $
        fullDesc
     <> header "hsbooru - a haskell *booru scraper using xapian"
    where parseCmd = hsubparser $ scrapeCmd <> updateCmd <> retryCmd <> infoCmd

main :: IO ()
main = join $ customExecParser parserOpts opts
    where parserOpts = prefs $ showHelpOnError
                            <> showHelpOnEmpty
                            <> multiSuffix ".."
                            <> columns 100
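-- Example invocations (the site name below is hypothetical; run
-- `hsbooru --help` to see the sites and options this build actually
-- supports). --dbDir/-d, -j and -B are the global options defined in
-- parseGlobalOpts:
--
--   hsbooru scrape somebooru --dbDir ~/.hsbooru -j 4 -B 2000
--   hsbooru update --dbDir ~/.hsbooru
--   hsbooru info somebooru --dbDir ~/.hsbooru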
-- ** Global option boilerplate

parseGlobalOpts :: Parser GlobalConf
parseGlobalOpts = Conf
    <$> strOption
        ( long "dbDir"
       <> short 'd'
       <> metavar "DIR"
       <> help "Database directory"
        )
    <*> (optional.strOption)
        ( long "imageDir"
       <> short 'i'
       <> metavar "DIR"
       <> help "Directory to store images in. Defaults to `<dbDir>/images`."
        )
    <*> option auto
        ( long "batchSize"
       <> short 'B'
       <> metavar "N"
       <> showDefault
       <> value 1000
       <> help "How many posts to fetch before committing them all to the \
               \database. Since this is a synchronous operation, a lower \
               \value reduces throughput; but too high a value can cause \
               \long stalls in the mailbox."
        )
    <*> option auto
        ( long "parallelism"
       <> short 'p'
       <> metavar "N"
       <> showDefault
       <> value 2
       <> help "How many in-flight requests to maintain per thread. Increasing \
               \this can improve throughput, but going too high risks running \
               \into network errors as the site kills connections."
        )
    <*> (optional.option auto)
        ( long "jobs"
       <> short 'j'
       <> metavar "N"
       <> help "How many threads to scrape from in parallel. Defaults to the \
               \number of detected CPU cores, but no more than 4. Going too \
               \high can be slower, if the server decides to rate limit."
        )
    <*> option auto
        ( long "retryCount"
       <> short 'r'
       <> metavar "N"
       <> showDefault
       <> value 3
       <> help "How often to retry each network request before giving up."
        )
    <*> option auto
        ( long "minTags"
       <> short 'm'
       <> metavar "N"
       <> showDefault
       <> value 0
       <> help "Skip posts with fewer tags than this. They will be retried \
               \automatically."
        )
    <*> (fmap splitTags . many . strOption)
        ( long "blackList"
       <> short 'b'
       <> metavar "TAG"
       <> help "Delete posts with any of these tags."
        )
    <*> (fmap splitTags . many . strOption)
        ( long "whiteList"
       <> short 'w'
       <> metavar "TAG"
       <> help "Delete posts without at least one of these tags."
        )
    <*> switch
        ( long "verbose"
       <> short 'v'
       <> showDefault
       <> help "Print data about every URL and post. Can be slow!"
        )
  where splitTags :: [String] -> [Text]
        splitTags = concatMap $ T.splitOn "," . T.pack
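-- splitTags lets tags be passed either as repeated flags or comma-separated,
-- e.g. splitTags ["tag1,tag2", "tag3"] == ["tag1", "tag2", "tag3"].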