Skip to content

Commit e4440a3

Browse files
author
Pete Smith
committed
Added datascraper utility for extracting speaker/session info
1 parent 338245b commit e4440a3

7 files changed

Lines changed: 452 additions & 1 deletion

File tree

BeyondResponsiveDesign.sln

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
33
# Visual Studio 2013
4-
VisualStudioVersion = 12.0.30501.0
4+
VisualStudioVersion = 12.0.30723.0
55
MinimumVisualStudioVersion = 10.0.40219.1
66
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BeyondResponsiveDesign.Menus", "BeyondResponsiveDesign.Menus\BeyondResponsiveDesign.Menus.csproj", "{54C61F9C-7B4B-4071-A7FB-5A9FE4D9777D}"
77
EndProject
8+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DataScraper", "DataScraper\DataScraper.csproj", "{F9BA2DCB-6095-481E-B1A8-1BC44FC075CC}"
9+
EndProject
810
Global
911
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1012
Debug|Any CPU = Debug|Any CPU
@@ -15,6 +17,10 @@ Global
1517
{54C61F9C-7B4B-4071-A7FB-5A9FE4D9777D}.Debug|Any CPU.Build.0 = Debug|Any CPU
1618
{54C61F9C-7B4B-4071-A7FB-5A9FE4D9777D}.Release|Any CPU.ActiveCfg = Release|Any CPU
1719
{54C61F9C-7B4B-4071-A7FB-5A9FE4D9777D}.Release|Any CPU.Build.0 = Release|Any CPU
20+
{F9BA2DCB-6095-481E-B1A8-1BC44FC075CC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21+
{F9BA2DCB-6095-481E-B1A8-1BC44FC075CC}.Debug|Any CPU.Build.0 = Debug|Any CPU
22+
{F9BA2DCB-6095-481E-B1A8-1BC44FC075CC}.Release|Any CPU.ActiveCfg = Release|Any CPU
23+
{F9BA2DCB-6095-481E-B1A8-1BC44FC075CC}.Release|Any CPU.Build.0 = Release|Any CPU
1824
EndGlobalSection
1925
GlobalSection(SolutionProperties) = preSolution
2026
HideSolutionNode = FALSE

DataScraper/App.config

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="utf-8" ?>
2+
<configuration>
3+
<startup>
4+
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
5+
</startup>
6+
</configuration>

DataScraper/DataScraper.csproj

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
4+
<PropertyGroup>
5+
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
6+
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
7+
<ProjectGuid>{F9BA2DCB-6095-481E-B1A8-1BC44FC075CC}</ProjectGuid>
8+
<OutputType>Exe</OutputType>
9+
<AppDesignerFolder>Properties</AppDesignerFolder>
10+
<RootNamespace>DataScraper</RootNamespace>
11+
<AssemblyName>DataScraper</AssemblyName>
12+
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
13+
<FileAlignment>512</FileAlignment>
14+
</PropertyGroup>
15+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
16+
<PlatformTarget>AnyCPU</PlatformTarget>
17+
<DebugSymbols>true</DebugSymbols>
18+
<DebugType>full</DebugType>
19+
<Optimize>false</Optimize>
20+
<OutputPath>bin\Debug\</OutputPath>
21+
<DefineConstants>DEBUG;TRACE</DefineConstants>
22+
<ErrorReport>prompt</ErrorReport>
23+
<WarningLevel>4</WarningLevel>
24+
</PropertyGroup>
25+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
26+
<PlatformTarget>AnyCPU</PlatformTarget>
27+
<DebugType>pdbonly</DebugType>
28+
<Optimize>true</Optimize>
29+
<OutputPath>bin\Release\</OutputPath>
30+
<DefineConstants>TRACE</DefineConstants>
31+
<ErrorReport>prompt</ErrorReport>
32+
<WarningLevel>4</WarningLevel>
33+
</PropertyGroup>
34+
<ItemGroup>
35+
<Reference Include="HtmlAgilityPack">
36+
<HintPath>..\packages\HtmlAgilityPack.1.4.6\lib\Net45\HtmlAgilityPack.dll</HintPath>
37+
</Reference>
38+
<Reference Include="Newtonsoft.Json, Version=6.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
39+
<SpecificVersion>False</SpecificVersion>
40+
<HintPath>..\packages\Newtonsoft.Json.6.0.4\lib\net45\Newtonsoft.Json.dll</HintPath>
41+
</Reference>
42+
<Reference Include="System" />
43+
<Reference Include="System.Core" />
44+
<Reference Include="System.Net.Http" />
45+
<Reference Include="System.Xml.Linq" />
46+
<Reference Include="System.Data.DataSetExtensions" />
47+
<Reference Include="Microsoft.CSharp" />
48+
<Reference Include="System.Data" />
49+
<Reference Include="System.Xml" />
50+
</ItemGroup>
51+
<ItemGroup>
52+
<Compile Include="Program.cs" />
53+
<Compile Include="Properties\AssemblyInfo.cs" />
54+
</ItemGroup>
55+
<ItemGroup>
56+
<None Include="App.config" />
57+
<None Include="packages.config" />
58+
</ItemGroup>
59+
<ItemGroup />
60+
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
61+
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
62+
Other similar extension points exist, see Microsoft.Common.targets.
63+
<Target Name="BeforeBuild">
64+
</Target>
65+
<Target Name="AfterBuild">
66+
</Target>
67+
-->
68+
</Project>

DataScraper/Program.cs

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
namespace DataScraper
2+
{
3+
using System;
4+
using System.Collections.Concurrent;
5+
using System.Collections.Generic;
6+
using System.IO;
7+
using System.Linq;
8+
using System.Net.Http;
9+
using System.Threading.Tasks;
10+
11+
using HtmlAgilityPack;
12+
13+
using Newtonsoft.Json;
14+
15+
/// <summary>
/// An external link shown on a speaker's profile, paired with the icon used to render it.
/// </summary>
public class Link
{
    /// <summary>
    /// Gets or sets the value of the <c>class</c> attribute taken from the link's icon element.
    /// </summary>
    public string Icon { get; set; }

    /// <summary>
    /// Gets or sets the target address taken from the link's <c>href</c> attribute.
    /// </summary>
    public string Url { get; set; }
}
21+
22+
/// <summary>
/// A conference session, identified by its numeric id and display title.
/// </summary>
public class Session
{
    /// <summary>
    /// Gets or sets the numeric identifier parsed from the last segment of the session URL.
    /// </summary>
    public int Id { get; set; }

    /// <summary>
    /// Gets or sets the session title, taken from the text of the session link.
    /// </summary>
    public string Title { get; set; }
}
28+
29+
/// <summary>
/// A conference speaker together with their biography, profile links and sessions.
/// Instances are serialised to <c>speakers.json</c> by <see cref="Program"/>.
/// </summary>
public class Speaker
{
    /// <summary>
    /// Gets or sets the numeric identifier parsed from the last segment of the speaker URL.
    /// </summary>
    public int Id { get; set; }

    /// <summary>
    /// Gets or sets the speaker's display name.
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// Gets or sets the biography as HTML (the concatenated inner HTML of the profile paragraphs).
    /// </summary>
    public string Bio { get; set; }

    /// <summary>
    /// Gets or sets the external links listed on the speaker's profile.
    /// </summary>
    public IEnumerable<Link> Links { get; set; }

    /// <summary>
    /// Gets or sets the sessions this speaker is presenting.
    /// </summary>
    public IEnumerable<Session> Sessions { get; set; }
}
41+
42+
/// <summary>
/// Console utility that scrapes speaker and session details from the DDD East Anglia
/// agenda, downloads each speaker's avatar, and writes everything to an output directory.
/// </summary>
public class Program
{
    /// <summary>Root address of the site being scraped; relative links are resolved against it.</summary>
    private const string SiteRoot = "http://www.dddeastanglia.com";

    /// <summary>Path of the agenda page, relative to <see cref="SiteRoot"/>.</summary>
    private const string AgendaUrl = "/Agenda";

    /// <summary>Directory used when no output directory is supplied on the command line.</summary>
    private const string DefaultOutputDir =
        "D:\\Code\\BeyondResponsiveDesign\\BeyondResponsiveDesign.Menus\\images\\speakers";

    /// <summary>
    /// Entry point. Scrapes every speaker linked from the agenda page, saves one
    /// <c>{id}.jpg</c> per speaker and a <c>speakers.json</c> file sorted by speaker name,
    /// then waits for a key press.
    /// </summary>
    /// <param name="args">
    /// Optional. <c>args[0]</c>, when present and non-blank, overrides the output directory;
    /// otherwise <see cref="DefaultOutputDir"/> is used.
    /// </param>
    public static void Main(string[] args)
    {
        var outputDir = args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0])
            ? args[0]
            : DefaultOutputDir;

        // C# 5 has no async Main, so block here once at the top level.
        // GetAwaiter().GetResult() rethrows the original exception rather than
        // wrapping it in an AggregateException as .Result / Task.WaitAll would.
        RunAsync(outputDir).GetAwaiter().GetResult();

        Console.WriteLine("Process completed, press any key to exit");
        Console.ReadKey();
    }

    /// <summary>
    /// Downloads the agenda, scrapes every distinct speaker page concurrently and writes
    /// the combined result to <c>speakers.json</c> inside <paramref name="outputDir"/>.
    /// </summary>
    /// <param name="outputDir">Directory that receives the images and the JSON file; created if missing.</param>
    private static async Task RunAsync(string outputDir)
    {
        Directory.CreateDirectory(outputDir);

        using (var imageClient = new HttpClient())
        using (var htmlClient = new HttpClient { BaseAddress = new Uri(SiteRoot) })
        {
            var agenda = new HtmlDocument();
            agenda.LoadHtml(await htmlClient.GetStringAsync(AgendaUrl));

            // De-duplicate the hrefs: if the agenda links the same speaker more than once,
            // scraping each occurrence would emit duplicate JSON entries and have two
            // downloads writing the same image file at the same time.
            var speakerHrefs = agenda.DocumentNode.Descendants("a")
                .Where(o => o.Attributes.Contains("class") && o.Attributes["class"].Value == "speakerName")
                .Select(o => o.Attributes["href"].Value)
                .Distinct()
                .ToList();

            // The work is asynchronous I/O, so the tasks are started directly
            // (no Task.Run) and awaited together.
            var speakers = await Task.WhenAll(
                speakerHrefs.Select(href => ScrapeSpeakerAsync(htmlClient, imageClient, href, outputDir)));

            using (var writer = new StreamWriter(Path.Combine(outputDir, "speakers.json")))
            {
                writer.Write(JsonConvert.SerializeObject(speakers.OrderBy(o => o.Name)));
            }
        }
    }

    /// <summary>
    /// Fetches one speaker page, extracts the speaker's details and saves their avatar
    /// as <c>{id}.jpg</c> in <paramref name="outputDir"/>.
    /// </summary>
    /// <param name="htmlClient">Client whose base address resolves <paramref name="href"/>.</param>
    /// <param name="imageClient">Client used to download the avatar image.</param>
    /// <param name="href">Link to the speaker page; its last path segment is the numeric speaker id.</param>
    /// <param name="outputDir">Existing directory that receives the avatar image.</param>
    /// <returns>The scraped speaker.</returns>
    private static async Task<Speaker> ScrapeSpeakerAsync(
        HttpClient htmlClient,
        HttpClient imageClient,
        string href,
        string outputDir)
    {
        var speakerPage = new HtmlDocument();
        speakerPage.LoadHtml(await htmlClient.GetStringAsync(href));

        // The first <div> whose class attribute contains "speaker" holds the whole profile.
        var container = speakerPage.DocumentNode.Descendants("div")
            .First(o => o.Attributes.Contains("class") && o.Attributes["class"].Value.Contains("speaker"));

        var titleElement = container.Descendants("h3").First();
        var linkElements = container.Descendants("section").First().Descendants("p");
        var paragraphElements = container.Elements("p");
        var sessionLinkElements = container.Element("ul").Descendants("a");

        var id = ParseTrailingId(href);

        // The name is the direct text node of the heading; the heading also contains the avatar <img>.
        var name = titleElement.Element("#text").InnerHtml;
        var bio = string.Join(string.Empty, paragraphElements.Select(o => o.InnerHtml));

        var links = linkElements
            .Select(o => new Link
            {
                Icon = o.Descendants("i").First().Attributes["class"].Value,
                Url = o.Descendants("a").First().Attributes["href"].Value
            })
            .ToList();

        var sessions = sessionLinkElements
            .Select(o => new Session
            {
                Id = ParseTrailingId(o.Attributes["href"].Value),
                Title = o.InnerText
            })
            .ToList();

        var speaker = new Speaker { Id = id, Name = name, Links = links, Bio = bio, Sessions = sessions };

        // NOTE(review): "s=50" looks like an avatar-size query parameter; the swap requests
        // a larger image. Confirm against the site's markup.
        var imageTag = titleElement.Descendants("img").First();
        var imageUrl = imageTag.Attributes["src"].Value.Replace("s=50", "s=300");

        // File.Create truncates an existing file. File.OpenWrite does not, so re-running
        // with a smaller image would leave stale trailing bytes in the .jpg.
        using (var stream = await imageClient.GetStreamAsync(imageUrl))
        using (var file = File.Create(Path.Combine(outputDir, id + ".jpg")))
        {
            await stream.CopyToAsync(file);
        }

        return speaker;
    }

    /// <summary>
    /// Parses the integer that forms the last '/'-separated segment of <paramref name="href"/>.
    /// </summary>
    /// <param name="href">A link such as <c>/Speakers/12</c>.</param>
    /// <returns>The trailing numeric id.</returns>
    /// <exception cref="FormatException">The last segment is not an integer.</exception>
    private static int ParseTrailingId(string href)
    {
        return int.Parse(href.Split('/').Last());
    }
}
111+
}

DataScraper/Properties/AssemblyInfo.cs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
using System.Reflection;
2+
using System.Runtime.CompilerServices;
3+
using System.Runtime.InteropServices;
4+
5+
// Descriptive metadata for the DataScraper assembly.
// Edit the values below to change what is embedded in the compiled output.
[assembly: AssemblyTitle("DataScraper")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("DataScraper")]
[assembly: AssemblyCopyright("Copyright © 2014")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Types in this assembly are hidden from COM by default. To expose a single type,
// apply ComVisible(true) to that type rather than changing the value here.
[assembly: ComVisible(false)]

// Type-library ID used if this project is ever exposed to COM.
[assembly: Guid("4b1e608b-24ba-4f64-805d-7d6c1138e189")]

// Version numbers have four parts: Major.Minor.Build.Revision.
// All four may be given explicitly, or Build and Revision can be generated
// automatically by using '*', for example: [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]

0 commit comments

Comments
 (0)