diff --git a/Crawler.Simple/Program.cs b/Crawler.Simple/Program.cs index 9ab7532..27edc22 100644 --- a/Crawler.Simple/Program.cs +++ b/Crawler.Simple/Program.cs @@ -30,8 +30,8 @@ private static void Main(string[] args) //SimpleCrawler.CnBlogsCrawler().Run(); //Console.ReadKey(); - //SimpleCrawler.RedisCnblogsCrawler().Run(); - //Console.ReadKey(); + SimpleCrawler.RedisCnblogsCrawler().Run(); + Console.ReadKey(); } } diff --git a/Crawler.Simple/SimpleCrawler.cs b/Crawler.Simple/SimpleCrawler.cs index 6874a04..1e7031b 100644 --- a/Crawler.Simple/SimpleCrawler.cs +++ b/Crawler.Simple/SimpleCrawler.cs @@ -53,7 +53,6 @@ public static Crawler CnieltsSpider() DownloadDirectory = @"E:\学习资料\English\新概念第二册\" }) .UseMultiThread(1) - .SetLogFactory(new NLoggerFactory()) .UseNamed("CnieltsSpider"); return CrawlerBuilder.Current.Builder(); } @@ -72,11 +71,10 @@ public static Crawler CnieltsV2Spider() .UsePipeline(new PipelineOptions()) .UsePipeline(new FileDownloadOptions() { - DownloadDirectory = @"~/CnieltsV2Spider/", + DownloadDirectory = @"~/CnieltsV2Spider/2017-12-10/", Downloader = new HttpDownloader() }) .UseMultiThread(3) - .SetLogFactory(new NLoggerFactory()) .UseParallelMode() .UseNamed("CnieltsV2Spider"); return CrawlerBuilder.Current.Builder(); @@ -99,7 +97,6 @@ public static ICrawler UrlFinderPipeline() Schedulers.SchedulerManager.GetScheduler>("UrlFinderPipeline") }) .UseMultiThread(10) - .SetLogFactory(new NLoggerFactory()) //.UseBloomFilter(int.MaxValue, 0.001F) .UseRedisBloomFilter() .UseNamed("UrlFinderPipeline"); @@ -113,17 +110,15 @@ public static ICrawler CrawlerFullSite() CrawlerBuilder.Current.ClearPipelines(); CrawlerBuilder.Current.ClearSites(); CrawlerBuilder.Current - .AddSite("http://cuiqingcai.com/") + .AddSite("https://www.yezismile.com") .UsePipeline(new UrlFinderOptons() { WaitForComplete = 10000, - UrlValidator = url => url.Contains("cuiqingcai.com"), + UrlValidator = url => url.Contains("yezismile.com"), Sleep = 500 }) - .UsePipeline(new FileDownloadOptions("~/Cuiqingcai/")) - .UseMultiThread(5) - .SetLogFactory(new NLoggerFactory()) - .UseBloomFilter(int.MaxValue, 0.001F) + .UsePipeline(new FileDownloadOptions("~/Yezismile/HtmlSources/")) + .UseMultiThread(8) .UseNamed("CrawlerFullSite"); return CrawlerBuilder.Current.Builder(); } @@ -161,8 +156,7 @@ public static ICrawler CnBlogsCrawler() WaitForComplete = waitForComplete, Cookie = cookie }) - .SetLogFactory(new NLoggerFactory()) - .UseBloomFilter(int.MaxValue, 0.001F) + .UseBloomFilter(int.MaxValue / 21, 0.001F) .UseMultiThread(5) .UseParallelMode(); @@ -202,8 +196,7 @@ public static ICrawler RedisCnblogsCrawler() WaitForComplete = waitForComplete, Cookie = cookie }) - .SetLogFactory(new NLoggerFactory()) - .UseMultiThread(5) + .UseMultiThread(1) .UseRedisBloomFilter() .UseParallelMode(); diff --git a/Crawler/Filter/UrlFilterManager.cs b/Crawler/Filter/UrlFilterManager.cs index 8160148..f7712fe 100644 --- a/Crawler/Filter/UrlFilterManager.cs +++ b/Crawler/Filter/UrlFilterManager.cs @@ -16,6 +16,6 @@ public static void SetUrlFilter(Func func) _urlFilter = func(); } - public static IUrlFilter Current => _urlFilter; + public static IUrlFilter Current => _urlFilter ?? (_urlFilter = new BloomFilter(int.MaxValue / 21, 0.001F)); } } \ No newline at end of file diff --git a/Crawler/Logger/LoggerManager.cs b/Crawler/Logger/LoggerManager.cs index cf066d6..c504aa3 100644 --- a/Crawler/Logger/LoggerManager.cs +++ b/Crawler/Logger/LoggerManager.cs @@ -10,6 +10,8 @@ public static class LoggerManager { private static ILoggerFactory _loggerFactory; + public static ILoggerFactory CurrentFactory => _loggerFactory ?? (_loggerFactory = new NLoggerFactory()); + public static void SetLogFactory(ILoggerFactory factory) { _loggerFactory = factory ?? throw new ArgumentNullException(nameof(factory)); @@ -22,7 +24,7 @@ public static ILogger GetLogger(string name) throw new ArgumentNullException(nameof(name)); } - return _loggerFactory.Create(name); + return CurrentFactory.Create(name); } public static ILogger GetLogger(Type type) @@ -31,12 +33,12 @@ public static ILogger GetLogger(Type type) { throw new ArgumentNullException(nameof(type)); } - return _loggerFactory.Create(type); + return CurrentFactory.Create(type); } public static ILogger GetLogger() { - return _loggerFactory.Create(); + return CurrentFactory.Create(); } } } diff --git a/Crawler/Schedulers/RedisScheduler.cs b/Crawler/Schedulers/RedisScheduler.cs index c7062d3..6dc9521 100644 --- a/Crawler/Schedulers/RedisScheduler.cs +++ b/Crawler/Schedulers/RedisScheduler.cs @@ -14,7 +14,6 @@ namespace Crawler.Schedulers public class RedisScheduler : IScheduler { private readonly string _redisSchedulerKey; - private readonly IUrlFilter _urlFilter; private readonly ReaderWriterLockSlim _lock = new ReaderWriterLockSlim(); private readonly IDatabase _database; private long _totalCount; @@ -32,7 +31,6 @@ public RedisScheduler(string name, string connectionString) _redisSchedulerKey = $"Crawler.Schedulers.RedisScheduler.{name}"; _database = ConnectionMultiplexer.Connect(connectionString).GetDatabase(); - _urlFilter = UrlFilterManager.Current; } object IScheduler.Pop() @@ -86,10 +84,10 @@ void IScheduler.Push(object @object) public virtual void Push(T requestSite) { - if (_urlFilter == null || !_urlFilter.Contains(_redisSchedulerKey + requestSite)) + if (UrlFilterManager.Current == null || !UrlFilterManager.Current.Contains(_redisSchedulerKey + requestSite)) { _database.ListLeftPush(_redisSchedulerKey, Serialize(requestSite)); - _urlFilter?.Add(_redisSchedulerKey + requestSite); + UrlFilterManager.Current?.Add(_redisSchedulerKey + requestSite); _totalCount++; } } diff --git a/Crawler/Schedulers/Scheduler.cs b/Crawler/Schedulers/Scheduler.cs index 0ff0b20..c3640d3 100644 --- a/Crawler/Schedulers/Scheduler.cs +++ b/Crawler/Schedulers/Scheduler.cs @@ -7,7 +7,6 @@ namespace Crawler.Schedulers { public class Scheduler : IScheduler { - private readonly IUrlFilter _urlFilter = UrlFilterManager.Current; private readonly ReaderWriterLockSlim _lock = new ReaderWriterLockSlim(); private readonly List _stack = new List(); private long _totalCount = 0; @@ -78,10 +77,10 @@ public long Count public virtual void Push(T requestSite) { - if (_urlFilter == null || !_urlFilter.Contains(requestSite.ToString())) + if (UrlFilterManager.Current == null || !UrlFilterManager.Current.Contains(requestSite.ToString())) { _stack.Add(requestSite); - _urlFilter?.Add(requestSite.ToString()); + UrlFilterManager.Current?.Add(requestSite.ToString()); _totalCount++; } } diff --git a/Crawler/Schedulers/SchedulerManager.cs b/Crawler/Schedulers/SchedulerManager.cs index d1ed41b..8583959 100644 --- a/Crawler/Schedulers/SchedulerManager.cs +++ b/Crawler/Schedulers/SchedulerManager.cs @@ -65,7 +65,8 @@ internal static IScheduler InternalGetScheduler(Type schedulerType, string key) { if (!schedules.TryGetValue(key, out scheduler)) { - scheduler = RedisSchedulerFactory(schedulerType, key) ?? (IScheduler) Activator.CreateInstance(schedulerType); + scheduler = RedisSchedulerFactory(schedulerType, key) ?? + (IScheduler) Activator.CreateInstance(schedulerType); schedules.Add(key, scheduler); } } @@ -75,7 +76,8 @@ internal static IScheduler InternalGetScheduler(Type schedulerType, string key) private static IScheduler RedisSchedulerFactory(Type schedulerType, string key) { - if (schedulerType.GetGenericTypeDefinition() == typeof(RedisScheduler<>)) + if (schedulerType.IsGenericType && + schedulerType.GetGenericTypeDefinition() == typeof(RedisScheduler<>)) { var constructors = schedulerType.GetConstructors(); @@ -87,7 +89,7 @@ private static IScheduler RedisSchedulerFactory(Type schedulerType, string key) var parameterTypes = parameters.Select(p => p.ParameterType).ToArray(); if (parameterTypes.Length != 2) continue; if (parameterTypes.Any(x => x != typeof(string))) continue; - return (IScheduler)constructor.Invoke(new object[] { schedulerKey, connectionString}); + return (IScheduler) constructor.Invoke(new object[] {schedulerKey, connectionString}); } } return null;