diff --git a/.DS_Store b/.DS_Store index 852ed5f6c152d397b97919b512f0305fac8ffae5..da0c60719f0a6e03b7bbf804258f69d11bc66ed4 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/spotify_million_playlist_dataset_challenge/README.md b/spotify_million_playlist_dataset_challenge/README.md new file mode 100644 index 0000000000000000000000000000000000000000..863dbb9e431f57db1b6a3b51cd60f43a50e4b0a2 --- /dev/null +++ b/spotify_million_playlist_dataset_challenge/README.md @@ -0,0 +1,114 @@ +# Challenge Set +Version 1, February 13, 2018 +(Documentation updated August 5, 2020) + +This is the challenge set for the Spotify Million Playlist Dataset Challenge. + +This challenge set contains 10,000 incomplete playlists. The challenge +is to recommend tracks for each of these playlists. See +[https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge) for +challenge details. + +## Format +The challenge set consists of a single JSON dictionary with three fields: + + * **date** - the date the challenge set was generated. This should be "2018-01-16 08:47:28.198015" + * **version** - the version of the challenge set. This should be "v1" + * **playlists** - an array of 10,000 incomplete playlists. Each element in this array contains the following fields: + * **pid** - the playlist ID + * **name** - (optional) - the name of the playlist. For some challenge playlists, the name will be missing. + * **num_holdouts** - the number of tracks that have been omitted from the playlist + * **tracks** - a (possibly empty) array of tracks that are in the playlist. 
Each element of this array contains the following fields: + * **pos** - the position of the track in the playlist (zero offset) + * **track_name** - the name of the track + * **track_uri** - the Spotify URI of the track + * **artist_name** - the name of the primary artist of the track + * **artist_uri** - the Spotify URI of the primary artist of the track + * **album_name** - the name of the album that the track is on + * **album_uri** - the Spotify URI of the album that the track is on + * **duration_ms** - the duration of the track in milliseconds + * **num_samples** - the number of tracks included in the playlist + * **num_tracks** - the total number of tracks in the playlist. + + Note that len(tracks) == num\_samples and num\_samples + num\_holdouts == num\_tracks
Additionally, playlists in the challenge set meet +the following constraints: + + * All tracks in the challenge set appear in the MPD + * All holdout tracks appear in the MPD + + +## Tools +Scripts for checking and verifying submissions + ++ check.py - checks the challenge set to make sure that it is internally consistent and properly formatted. ++ verify_submission.py - verifies that a given challenge submission is properly formatted + + +## Verifying the challenge set +To verify that you have an uncorrupted challenge set you can check its md5. E.g. + + % md5sum --check md5 + + challenge_set.json: OK + +Use check.py to verify that the challenge set is internally consistent. + + % python check.py + + stats: + tests: 4634003 + errors: 0 + + challenge_set.json is OK + + +## Sample Submission +Included in the challenge set is a sample challenge submission: + + sample_submission.csv + +This sample shows the expected format for your submission to the challenge. Your +submission should follow the following rules: + + * All fields are comma separated. It is ok, but optional to have whitespace before and after the comma. + * Comments are allowed with a '#' at the beginning of a line. + * Empty lines are ok (they are ignored). + * The first non-commented/blank line must start with "team_info" and then include the team name, + and contact email address. + * For each challenge playlist there must be a line of the form: + pid, trackuri\_1, trackuri\_2, trackuri\_3, ..., trackuri\_499, trackuri\_500 with exactly 500 tracks. + * The seed tracks, provided as part of the challenge set for any particular playlist, + must *not* be included in the submission for that playlist. + * The submission for a particular playlist must *not* contain duplicated tracks. + * The submission for a particular playlist must have exactly 500 tracks. + * Any submission violating one of the rules will be rejected by the scoring system. 
"""
    Checks to make sure that the challenge_set is internally consistent.

    usage: python check.py challenge_set.json
"""

import json
import sys

# Running tallies, mutated by tassert() as checks execute.
stats = {
    "tests": 0,
    "errors": 0,
}

# Fields that every challenge playlist must carry.
required_playlist_fields = [
    "num_holdouts",
    "pid",
    "num_tracks",
    "tracks",
    "num_samples",
]
# "name" may be absent: some challenge categories withhold the playlist title.
optional_playlist_fields = ["name"] + required_playlist_fields

# Exactly the fields every track entry must carry.
track_fields = {
    "pos",
    "artist_name",
    "artist_uri",
    "track_uri",
    "track_name",
    "album_uri",
    "album_name",
    "duration_ms",
}


def check_challenge_set(path):
    """Validate the challenge set at *path* and print summary statistics.

    Runs one tassert() per structural rule (version string, playlist count,
    field presence/validity, and the num_samples + num_holdouts == num_tracks
    invariant) and reports the totals via the module-level ``stats`` dict.
    """
    # "with" guarantees the file is closed even if the JSON fails to parse.
    with open(path) as f:
        challenge_set = json.load(f)

    tassert(challenge_set["version"] == "v1", "proper version")
    tassert(len(challenge_set["playlists"]) == 10000, "proper number of playlists")

    known_ids = set()
    unique_tracks = set()
    unique_albums = set()
    unique_artists = set()
    total_tracks = 0
    for playlist in challenge_set["playlists"]:
        # Core invariant: samples (given) + holdouts (hidden) == total tracks.
        ntracks = playlist["num_samples"] + playlist["num_holdouts"]
        tassert(playlist["pid"] not in known_ids, "unique pid")
        tassert(ntracks == playlist["num_tracks"], "consistent num_tracks")
        tassert(
            playlist["num_samples"] == len(playlist["tracks"]), "consistent num_samples"
        )
        known_ids.add(playlist["pid"])

        for field in playlist:
            tassert(field in optional_playlist_fields, "valid playlist field")

        for f in required_playlist_fields:
            tassert(f in playlist, "missing required playlist field " + f)

        for track in playlist["tracks"]:
            for field in track:
                tassert(field in track_fields, "valid track field")
            for f in track_fields:
                tassert(f in track, "missing required track field " + f)

            unique_tracks.add(track["track_uri"])
            unique_albums.add(track["album_uri"])
            unique_artists.add(track["artist_uri"])
            total_tracks += 1

    tassert(len(known_ids) == 10000, "proper number of unique IDs")

    print()
    print("stats:")
    for k, v in stats.items():
        print("%s: %d" % (k, v))
    print()

    print("total playlists:", len(challenge_set["playlists"]))
    print("total tracks: ", total_tracks)
    print("unique tracks: ", len(unique_tracks))
    print("unique albums: ", len(unique_albums))
    print("unique artists: ", len(unique_artists))
    print()

    if stats["errors"] == 0:
        print("challenge_set.json is OK")
    else:
        print("challenge_set.json has errors")


def tassert(cond, text):
    """Record one test in ``stats``; count and report *text* when it fails."""
    stats["tests"] += 1
    if not cond:
        stats["errors"] += 1
        print("error: " + text)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: check.py challenge_set.json")
    else:
        check_challenge_set(sys.argv[1])
"""
    Verifies that a given challenge submission is properly constructed.

    Usage:

        python verify_submission.py challenge_set.json submission.csv
"""
import json
import sys

# Every playlist submission must contain exactly this many recommended tracks.
NTRACKS = 500


def verify_submission(challenge_path, submission_path):
    """Check *submission_path* against the challenge set at *challenge_path*.

    Returns the number of formatting errors found (0 means the submission
    is acceptable). Errors are also reported on stdout as they are found.
    """
    has_team_info = False
    error_count = 0

    try:
        # "with" closes the file even if the JSON is malformed.
        with open(challenge_path) as f:
            challenge = json.load(f)
    except FileNotFoundError:
        print("Can't read the challenge set")
        return error_count + 1

    pids = {playlist["pid"] for playlist in challenge["playlists"]}
    if len(challenge["playlists"]) != 10000:
        print("Bad challenge set")
        error_count += 1

    # Seed tracks for each challenge playlist: a submission must not repeat
    # any track that was already given as a seed for that playlist.
    seed_tracks = {
        playlist["pid"]: {track["track_uri"] for track in playlist["tracks"]}
        for playlist in challenge["playlists"]
    }

    found_pids = set()

    # A corrupt challenge set makes further checks meaningless.
    if error_count > 0:
        return error_count

    with open(submission_path) as f:
        # start=1 so reported line numbers match editor line numbers.
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            # Blank lines and '#' comments are ignored.
            if not line or line.startswith("#"):
                continue

            if not has_team_info:
                # The first real line must be the team_info header.
                if line.startswith("team_info"):
                    has_team_info = True
                else:
                    print("missing team_info at line", line_no)
                    error_count += 1
                continue

            fields = [fld.strip() for fld in line.split(",")]
            try:
                pid = int(fields[0])
            except ValueError:
                print("bad pid (should be an integer)", fields[0], "at line", line_no)
                error_count += 1
                continue
            tracks = fields[1:]
            found_pids.add(pid)
            if pid not in pids:
                print("bad pid", pid, "at line", line_no)
                error_count += 1
            if len(tracks) != NTRACKS:
                print(
                    "wrong number of tracks, found",
                    len(tracks),
                    "should have",
                    NTRACKS,
                    "at",
                    line_no,
                )
                error_count += 1
            unique_tracks = set(tracks)
            if len(unique_tracks) != NTRACKS:
                print(
                    "wrong number of unique tracks, found",
                    len(unique_tracks),
                    "should have",
                    NTRACKS,
                    "at",
                    line_no,
                )
                error_count += 1

            # .get() so an unknown pid is reported above instead of
            # crashing here with a KeyError.
            if seed_tracks.get(pid, set()).intersection(unique_tracks):
                print(
                    "found seed tracks in the submission for playlist",
                    pid,
                    "at",
                    line_no,
                )
                error_count += 1

            for uri in tracks:
                if not is_track_uri(uri):
                    print("bad track uri", uri, "at", line_no)
                    error_count += 1

    if len(found_pids) != len(pids):
        print(
            "wrong number of playlists, found", len(found_pids), "expected", len(pids)
        )
        error_count += 1

    return error_count


def is_track_uri(uri):
    """Return True iff *uri* has the form spotify:track:<22-character id>."""
    fields = uri.split(":")
    return (
        len(fields) == 3
        and fields[0] == "spotify"
        and fields[1] == "track"
        and len(fields[2]) == 22
    )


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("usage: python verify_submission.py challenge_set.json submission.csv")
        sys.exit(1)  # non-zero: the script was invoked incorrectly
    errors = verify_submission(sys.argv[1], sys.argv[2])
    if errors == 0:
        print(
            "Submission is OK! Remember to gzip your submission before submitting it to the challenge."
        )
    else:
        print(
            "Your submission has",
            errors,
            "errors. If you submit it, it will be rejected.",
        )