diff --git a/.gitignore b/.gitignore index b4ca71e..ef2362d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.DS_Store full/ lite/ +release/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b059c7..e3ceac3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,27 +1,56 @@ # CHANGELOG.md +## 1.2.0 (2021-07-30) + +**New:** + + - [#36](https://github.com/unsplash/datasets/issues/36): Included [BlurHash](https://blurha.sh/) of each photo in the dataset + +**Fix:** + + - [#39](https://github.com/unsplash/datasets/issues/39): Fixed AI confidence for AI keywords + +**Data:** + + - Added more search conversions + - Historical data for search conversions will now be limited to 1 year before each version's release date. + - Added ~1M photos to the Full Dataset. + - Replaced 794 dead photos (removed from Unsplash) in the Lite Dataset with approved photos + +**Lite dataset link:** + + - Version link: [Version 1.2.0](https://unsplash.com/data/lite/1.2.0) + +**Integrity checks (SHA-256):** + + - Lite: `461fa4a1796b7966fc3aa904ce2e7f18890323243ed0e95f47c7042b335fcd98` + - Full: `daa99dab8ba7a47d530356311ffa73f17eb403898a75399c54812e9dd582f8af` + ## 1.1.0 (2020-09-23) **New:** -- [#10](https://github.com/unsplash/datasets/issues/10): Included user descriptions of the photos -- [#21](https://github.com/unsplash/datasets/issues/21): Included width, height and aspect ratio of photos -- [#22](https://github.com/unsplash/datasets/issues/22): Included colors data from photos, coming from a 3rd party AI + + - [#10](https://github.com/unsplash/datasets/issues/10): Included user descriptions of the photos + - [#21](https://github.com/unsplash/datasets/issues/21): Included width, height and aspect ratio of photos + - [#22](https://github.com/unsplash/datasets/issues/22): Included colors data from photos, coming from a 3rd party AI **Fix:** -- [#13](https://github.com/unsplash/datasets/issues/13): Trimmed [some 
fields](https://github.com/unsplash/datasets/issues/13#issuecomment-674709294). -- [#29](https://github.com/unsplash/datasets/issues/29): Replaced newlines in keywords file by spaces to avoid CSV importation issues + - [#13](https://github.com/unsplash/datasets/issues/13): Trimmed [some fields](https://github.com/unsplash/datasets/issues/13#issuecomment-674709294). + - [#29](https://github.com/unsplash/datasets/issues/29): Replaced newlines in keywords file by spaces to avoid CSV importation issues **Data:** -- Replaced 307 deleted photos in the Lite dataset with new approved photos -- Removed about 17k deleted photos in the Full dataset -- Updated conversions data with latest conversions. Full dataset now weighs ~25GB (vs. 16GB) + + - Replaced 307 deleted photos in the Lite dataset with new approved photos + - Removed about 17k deleted photos in the Full dataset + - Updated conversions data with latest conversions. Full dataset now weighs ~25GB (vs. 16GB) **Lite dataset link:** - Version link: [Version 1.1.0](https://unsplash.com/data/lite/1.1.0) **Integrity checks (SHA-256):** + - Lite: `266e45a8658ab2456779b3376b109e435e595646126846603f2efee5b47ee526` - Full: `19abc3494bda06e36e61ccabf4dd2ca8e046ac50a5e4e3570cc8aa89ed6a9713` @@ -39,5 +68,6 @@ - Lite dataset links now follow the pattern `https://unsplash.com/data/lite/{version}` **Integrity checks (SHA-256):** + - Lite: `aa199951dd8756563f7ffef4abbc2d20c845bcff62241ae677af523728819d60` - Full: `ee47f7542e5ef260e6b904046b4837532f420412a0e2c299dcecab55acd28d1f` diff --git a/DOCS.md b/DOCS.md index 4050ab6..973063f 100644 --- a/DOCS.md +++ b/DOCS.md @@ -38,6 +38,7 @@ The `photos.tsv` dataset has one row per photo. 
It contains properties of the ph | ai_primary_landmark_latitude | Latitude of the landmark, generated by a 3rd party AI | | ai_primary_landmark_longitude | Longitude of the landmark, generated by a 3rd party AI | | ai_primary_landmark_confidence | Landmark confidence of the 3rd party AI | +| blur_hash | [BlurHash](https://blurha.sh/) hash of the photo | ## 2 - keywords.tsv @@ -48,9 +49,9 @@ about how a keyword is connected to a photo and the conversions of the photo our |-------------------------------|-------------| | photo_id | ID of the Unsplash photo | | keyword | Keyword or search term | -| ai_service_1_confidence | Confidence for the keyword from a 3rd party AI | -| ai_service_2_confidence | Confidence for the keyword from another 3rd party AI | -| suggested_by_user | Whether the keyword was added by a user (a human) | +| ai_service_1_confidence | Confidence for the keyword from a 3rd party AI (0-100) | +| ai_service_2_confidence | Confidence for the keyword from another 3rd party AI (0-100) | +| suggested_by_user | Whether the keyword was added by a user (human) | ## 3 - collections.tsv @@ -71,7 +72,7 @@ when the photo was added to the collection and gives the title of the collection *Note: a conversion is currently defined as a user selecting an image to download it.* -The `conversions.tsv` dataset has one row per search conversion. The dataset tells you which photo has been downloaded for a search, the country of origin, and an anonymous identifier to indiciate the unique users. +The `conversions.tsv` dataset has one row per search conversion. The dataset tells you which photo has been downloaded for a search, the country of origin, and an anonymous identifier to indicate the unique users. The data goes back up to 1 year before the release of each version of the dataset. 
| Field | Description | |-------------------------------|-------------| diff --git a/README.md b/README.md index 36cc807..3eaf67e 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,12 @@ ![](https://unsplash.com/blog/content/images/2020/08/dataheader.jpg) -The Unsplash Dataset is made up of over 200,000+ contributing global photographers and data sourced from hundreds of millions of searches across a nearly unlimited number of uses and contexts. Due to the breadth of intent and semantics contained within the Unsplash dataset, it enables new opportunities for research and learning. +The Unsplash Dataset is made up of over 250,000 contributing global photographers and data sourced from hundreds of millions of searches across a nearly unlimited number of uses and contexts. Due to the breadth of intent and semantics contained within the Unsplash dataset, it enables new opportunities for research and learning. The Unsplash Dataset is offered in two datasets: - the Lite dataset: available for commercial and noncommercial usage, containing 25k nature-themed Unsplash photos, 25k keywords, and 1M searches -- the Full dataset: available for noncommercial usage, containing 2M+ high-quality Unsplash photos, 5M keywords, and over 250M searches +- the Full dataset: available for noncommercial usage, containing 3M+ high-quality Unsplash photos, 5M keywords, and over 250M searches As the Unsplash library continues to grow, we’ll release updates to the dataset with new fields and new images, with each subsequent release being [semantically versioned](https://semver.org/). @@ -21,11 +21,11 @@ For more on the Unsplash Dataset, see [our announcement](https://unsplash.com/bl The Lite dataset contains all of the same fields as the Full dataset, but is limited to ~25,000 photos. It can be used for both commercial and non-commercial usage, provided you abide by [the terms](https://github.com/unsplash/datasets/blob/master/TERMS.md). 
-[⬇️ Download the Lite dataset](https://unsplash.com/data/lite/latest) [~550MB] +[⬇️ Download the Lite dataset](https://unsplash.com/data/lite/latest) [~650MB compressed, ~1.4GB raw] ### Full Dataset -The Full dataset is available for non-commercial usage and all uses must abide by [the terms](https://github.com/unsplash/datasets/blob/master/TERMS.md). To access, please go to [unsplash.com/data](https://unsplash.com/data) and request access. The dataset weighs ~25GB. +The Full dataset is available for non-commercial usage and all uses must abide by [the terms](https://github.com/unsplash/datasets/blob/master/TERMS.md). To access, please go to [unsplash.com/data](https://unsplash.com/data) and request access. The dataset weighs ~20GB compressed (~43GB raw). ## Documentation @@ -45,7 +45,7 @@ We're making this data open and available with the hopes of enabling researchers We'd love to see what you create, whether that's a research paper, a machine learning model, a blog post, or just an interesting discovery in the data. Send us an email at [data@unsplash.com](mailto:data@unsplash.com). -If you're using the dataset in a research paper, you can attribute the dataset as `Unsplash Lite Dataset 1.1.0` or `Unsplash Full Dataset 1.1.0` and link to the permalink [`unsplash.com/data`](https://unsplash.com/data). +If you're using the dataset in a research paper, you can attribute the dataset as `Unsplash Lite Dataset 1.2.0` or `Unsplash Full Dataset 1.2.0` and link to the permalink [`unsplash.com/data`](https://unsplash.com/data). 
---- diff --git a/how-to/psql/create_tables.sql b/how-to/psql/create_tables.sql index 199bc70..bcad0ef 100644 --- a/how-to/psql/create_tables.sql +++ b/how-to/psql/create_tables.sql @@ -30,7 +30,8 @@ CREATE TABLE unsplash_photos ( ai_primary_landmark_name varchar(255), ai_primary_landmark_latitude float, ai_primary_landmark_longitude float, - ai_primary_landmark_confidence varchar(255) + ai_primary_landmark_confidence varchar(255), + blur_hash varchar(255) ); CREATE TABLE unsplash_keywords ( @@ -44,7 +45,7 @@ CREATE TABLE unsplash_keywords ( CREATE TABLE unsplash_collections ( photo_id varchar(11), - collection_id integer, + collection_id varchar(11), collection_title text, photo_collected_at timestamp, PRIMARY KEY (photo_id, collection_id)