Star Wars RDF Integration Demo
This post describes the data loading of the RDF Integration System (RIS) about Star Wars used in Obi-Wan. The RIS is build on top of three complementary data sources in which only Star Wars data are been kept:
- IMDB partially loads in a relational database (Postgres),
- Wikidata partially loads in RDF graph database (Jena TDB),
- Star Wars API documents loads in JSON documents database (MongoDB).
IMDB
We downloaded the IMDB dumps at 12/03/2020 from https://www.imdb.com/interfaces/#plain. The imdb lacks of character identifiers since they decide to remove it from the database, but they kept the character names. In the demo, we use the IMDB data about Star Wars movies, their writers, directors, producer, actors with their character names.
Data Cleaning
We generate clean data tuple files for the loading:
- we create a new table for the genre
we denormalized the person table, containing a array of character names for each title and actor identifiers, by introducing a tuple for each character name (the pair title,actor is not a key anymore)
import csv with open('title.basics.tsv', newline='') as tsvin, open('genre.tsv', 'w', newline='') as genreout, open('title.tsv', 'w', newline='') as titleout: tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE) genreout = csv.writer(genreout, delimiter='\t') titleout = csv.writer(titleout, delimiter='\t') iterrow = iter(tsvin) next(iterrow) for row in iterrow: ins = row.copy() ins.insert(0, ins[0][2:]) ins = ins[:-1] titleout.writerow(ins) if row[8] != '\\N': title_id = row[0] genres = row[8].split(',') for genre in genres: genreout.writerow([title_id, genre]) with open('title.principals.tsv', newline='') as tsvin, open('principals.tsv', 'w', newline='') as csvout: tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE) csvout = csv.writer(csvout, delimiter='\t') iterrow = iter(tsvin) next(iterrow) for row in iterrow: if row[5] != '\\N': names = row[5][1:-1].split(',') for name in names: ins = row.copy() ins[5] = name[1:-1] csvout.writerow(ins) else: csvout.writerow(row) with open('name.basics.tsv', newline='') as tsvin, open('name.tsv', 'w', newline='') as csvout: tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE) csvout = csv.writer(csvout, delimiter='\t') iterrow = iter(tsvin) next(iterrow) for row in iterrow: csvout.writerow(row[:-2])
Loading
CREATE DATABASE imdb; DROP TABLE IF EXISTS title; CREATE TABLE title ( id integer primary key, tconst varchar(100) default NULL, titleType varchar(100) default NULL, primaryTitle varchar(2000) default NULL, originalTitle varchar(2000) default NULL, isAdult integer, startYear integer, endYear integer, runtimeMinutes integer); COPY title FROM '/home/mburon/téléchargements/data/imdb/title.tsv' WITH DELIMITER E'\t'; CREATE INDEX on title (tconst); DROP TABLE IF EXISTS genre; CREATE TABLE genre ( tconst varchar(100), genre varchar(100) default NULL); COPY genre FROM '/home/mburon/téléchargements/data/imdb/genre.tsv' WITH DELIMITER E'\t'; CREATE INDEX on genre (tconst); DROP TABLE IF EXISTS casting; CREATE TABLE casting ( tconst varchar(100), ordering integer default NULL, nconst varchar(100), category varchar(100) default NULL, job varchar(2000) default NULL, character varchar(2000) default NULL); COPY casting FROM '/home/mburon/téléchargements/data/imdb/principals.tsv' WITH DELIMITER E'\t'; CREATE INDEX on casting (tconst); CREATE INDEX on casting (nconst); DROP TABLE IF EXISTS person; CREATE TABLE person ( nconst varchar(100), primaryName varchar(1000) default NULL, birthYear integer, deathYear integer); COPY person FROM '/home/mburon/téléchargements/data/imdb/name.tsv' WITH DELIMITER E'\t'; CREATE INDEX on person (nconst);
Star Wars Tables
select * from title where tconst in ('tt0076759', 'tt0080684', 'tt0086190', 'tt0120915', 'tt0121765', 'tt0121766', 'tt2488496', 'tt2527336', 'tt2527338', 'tt3748528');
id | tconst | titletype | primarytitle | originaltitle | isadult | startyear | endyear | runtimeminutes |
---|---|---|---|---|---|---|---|---|
86190 | tt0086190 | movie | Star Wars: Episode VI - Return of the Jedi | Star Wars: Episode VI - Return of the Jedi | 0 | 1983 | 131 | |
76759 | tt0076759 | movie | Star Wars: Episode IV - A New Hope | Star Wars | 0 | 1977 | 121 | |
80684 | tt0080684 | movie | Star Wars: Episode V - The Empire Strikes Back | Star Wars: Episode V - The Empire Strikes Back | 0 | 1980 | 124 | |
120915 | tt0120915 | movie | Star Wars: Episode I - The Phantom Menace | Star Wars: Episode I - The Phantom Menace | 0 | 1999 | 136 | |
121765 | tt0121765 | movie | Star Wars: Episode II - Attack of the Clones | Star Wars: Episode II - Attack of the Clones | 0 | 2002 | 142 | |
121766 | tt0121766 | movie | Star Wars: Episode III - Revenge of the Sith | Star Wars: Episode III - Revenge of the Sith | 0 | 2005 | 140 | |
2488496 | tt2488496 | movie | Star Wars: Episode VII - The Force Awakens | Star Wars: Episode VII - The Force Awakens | 0 | 2015 | 138 | |
2527336 | tt2527336 | movie | Star Wars: Episode VIII - The Last Jedi | Star Wars: Episode VIII - The Last Jedi | 0 | 2017 | 152 | |
2527338 | tt2527338 | movie | Star Wars: Episode IX - The Rise of Skywalker | Star Wars: Episode IX - The Rise of Skywalker | 0 | 2019 | 142 | |
3748528 | tt3748528 | movie | Rogue One: A Star Wars Story | Rogue One | 0 | 2016 | 133 |
CREATE TABLE swTitle AS select * from title where tconst in ('tt0076759', 'tt0080684', 'tt0086190', 'tt0120915', 'tt0121765', 'tt0121766', 'tt2488496', 'tt2527336', 'tt2527338', 'tt3748528');
CREATE TABLE swGenre AS select genre.tconst, genre.genre from genre, swTitle where genre.tconst = swTitle.tconst;
DROP TABLE swCasting; CREATE TABLE swCasting AS select distinct casting.tconst, casting.ordering, casting.nconst, casting.category, casting.job, casting.character from casting, swTitle where casting.tconst = swTitle.tconst;
insert missing values in the casting
INSERT INTO swCasting VALUES ('tt0086190', 11, 'nm0000469', 'actor', NULL, 'Darth Vader'); INSERT INTO swCasting VALUES ('tt0076759', 11, 'nm0000469', 'actor', NULL, 'Darth Vader'); INSERT INTO swCasting VALUES ('tt0080684', 11, 'nm0000469', 'actor', NULL, 'Darth Vader'); INSERT INTO swCasting VALUES ('tt3748528', 11, 'nm1872855', 'actor', NULL, 'Darth Vader');
Drop table swPerson; CREATE TABLE swPerson AS select distinct person.nconst, person.primaryname, person.birthyear, person.deathyear from person, swCasting where swCasting.nconst = person.nconst;
select * from swCasting;
Wikidata
We downloaded some triples from Wikidata about Star Wars characters (e.g. Obi-Wan Kenobi), like their performers, their organisation memberships, their relation with others characters (student of, enemy of, child, spouse).
>sw.full.nt while read i; do if [[ -n $i ]] && [[ ${i:0:1} != "#" ]] then curl https://www.wikidata.org/wiki/Special:EntityData/${i}.nt >> sw.full.nt fi done <wikidata-sw-entity
We keep only certain interesting properties.
>sw.nt #IMDB character/name/title ID grep " <http://www.wikidata.org/prop/statement/P345> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/P345> " sw.full.nt >> sw.nt #Star Wars Databank ID grep " <http://www.wikidata.org/prop/statement/P3798> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/P3798> " sw.full.nt >> sw.nt # performer property grep " <http://www.wikidata.org/prop/P175> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/statement/P175> " sw.full.nt >> sw.nt # memberOf property grep " <http://www.wikidata.org/prop/P463> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/statement/P463> " sw.full.nt >> sw.nt # student grep " <http://www.wikidata.org/prop/P802> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/statement/P802> " sw.full.nt >> sw.nt # enemy of grep " <http://www.wikidata.org/prop/P7047> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/statement/P7047> " sw.full.nt >> sw.nt # child grep " <http://www.wikidata.org/prop/P40> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/statement/P40> " sw.full.nt >> sw.nt # spouse grep " <http://www.wikidata.org/prop/P26> " sw.full.nt >> sw.nt grep " <http://www.wikidata.org/prop/statement/P26> " sw.full.nt >> sw.nt while read i; do if [[ -n $i ]] && [[ ${i:0:1} != "#" ]] then grep "<http://www.wikidata.org/entity/${i}> <http://www.w3.org/2000/01/rdf-schema#label>" sw.full.nt | grep "@en ." >> sw.nt fi done <wikidata-sw-entity
Star Wars API
We downloaded also the JSON documents from a Star Wars API, describing character species, spaceships and their pilots etc.
Download
for i in {1..75}; do curl https://swapi.co/api/starships/$i/ >> starships.json done for i in {1..75}; do curl https://swapi.co/api/vehicles/$i/ >> vehicles.json done for i in {1..87}; do curl https://swapi.co/api/people/$i/ >> people.json done for i in {1..37}; do curl https://swapi.co/api/species/ >> species.json done
Insertion of People IRI
We insert formated name of people to create IRI of the form https://www.starwars.com/databank/luke-skywalker
import json from json import JSONDecoder, JSONDecodeError import re NOT_WHITESPACE = re.compile(r'[^\s]') def decode_stacked(document, pos=0, decoder=JSONDecoder()): while True: match = NOT_WHITESPACE.search(document, pos) if not match: return pos = match.start() try: obj, pos = decoder.raw_decode(document, pos) except JSONDecodeError: # do something sensible if there's some error raise yield obj with open('people.json', newline='') as f, open('people-iri.json', 'w', newline='') as out: for obj in decode_stacked(f.read()): if 'name' in obj: formated_name = obj['name'].lower().replace(" ", "-") obj['formatedName'] = formated_name json.dump(obj, out) out.write('\n')
Loading in MongoDB
We insert 37 starships and 86 people
mongoimport --drop -c=people -d=swapi --mode=merge --file=people.json mongoimport --drop -c=starships -d=swapi --mode=merge --file=starships.json mongoimport --drop -c=vehicles -d=swapi --mode=merge --file=vehicles.json mongoimport --drop -c=species -d=swapi --mode=merge --file=species.json