Star Wars RDF Integration Demo

This post describes the data loading of the RDF Integration System (RIS) about Star Wars used in Obi-Wan. The RIS is build on top of three complementary data sources in which only Star Wars data are been kept:

IMDB partially loads in a relational database (Postgres),
Wikidata partially loads in RDF graph database (Jena TDB),
Star Wars API documents loads in JSON documents database (MongoDB).

IMDB

We downloaded the IMDB dumps at 12/03/2020 from https://www.imdb.com/interfaces/#plain. The imdb lacks of character identifiers since they decide to remove it from the database, but they kept the character names. In the demo, we use the IMDB data about Star Wars movies, their writers, directors, producer, actors with their character names.

Data Cleaning

We generate clean data tuple files for the loading:

we create a new table for the genre

we denormalized the person table, containing a array of character names for each title and actor identifiers, by introducing a tuple for each character name (the pair title,actor is not a key anymore)

import csv

with open('title.basics.tsv', newline='') as tsvin, open('genre.tsv', 'w', newline='') as genreout, open('title.tsv', 'w', newline='') as titleout:
    tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE)
    genreout = csv.writer(genreout, delimiter='\t')
    titleout = csv.writer(titleout, delimiter='\t')

    iterrow = iter(tsvin)
    next(iterrow)
    for row in iterrow:
        ins = row.copy()
        ins.insert(0, ins[0][2:])
        ins = ins[:-1]
        titleout.writerow(ins)
        if row[8] != '\\N':
            title_id = row[0]
            genres = row[8].split(',')
            for genre in genres:
                genreout.writerow([title_id, genre])

with open('title.principals.tsv', newline='') as tsvin, open('principals.tsv', 'w', newline='') as csvout:
    tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE)
    csvout = csv.writer(csvout, delimiter='\t')

    iterrow = iter(tsvin)
    next(iterrow)
    for row in iterrow:
        if row[5] != '\\N':
            names = row[5][1:-1].split(',')
            for name in names:
                ins = row.copy()
                ins[5] = name[1:-1]
                csvout.writerow(ins)
        else:
            csvout.writerow(row)

with open('name.basics.tsv', newline='') as tsvin, open('name.tsv', 'w', newline='') as csvout:
    tsvin = csv.reader(tsvin, delimiter='\t', quoting=csv.QUOTE_NONE)
    csvout = csv.writer(csvout, delimiter='\t')

    iterrow = iter(tsvin)
    next(iterrow)
    for row in iterrow:
        csvout.writerow(row[:-2])

Loading

CREATE DATABASE imdb;

DROP TABLE IF EXISTS title;
CREATE TABLE title (
id integer primary key,
tconst varchar(100) default NULL,
titleType varchar(100) default NULL,
primaryTitle varchar(2000) default NULL,
originalTitle varchar(2000) default NULL,
isAdult integer,
startYear integer,
endYear integer,
runtimeMinutes integer);

COPY title FROM '/home/mburon/téléchargements/data/imdb/title.tsv' WITH DELIMITER E'\t';

CREATE INDEX on title (tconst);

DROP TABLE IF EXISTS genre;
CREATE TABLE genre (
tconst varchar(100),
genre varchar(100) default NULL);

COPY genre FROM '/home/mburon/téléchargements/data/imdb/genre.tsv' WITH DELIMITER E'\t';

CREATE INDEX on genre (tconst);

DROP TABLE IF EXISTS casting;
CREATE TABLE casting (
tconst varchar(100),
ordering integer default NULL,
nconst varchar(100),
category varchar(100) default NULL,
job varchar(2000) default NULL,
character varchar(2000) default NULL);

COPY casting FROM '/home/mburon/téléchargements/data/imdb/principals.tsv' WITH DELIMITER E'\t';

CREATE INDEX on casting (tconst);
CREATE INDEX on casting (nconst);

DROP TABLE IF EXISTS person;
CREATE TABLE person (
nconst varchar(100),
primaryName varchar(1000) default NULL,
birthYear integer,
deathYear integer);

COPY person FROM '/home/mburon/téléchargements/data/imdb/name.tsv' WITH DELIMITER E'\t';
CREATE INDEX on person (nconst);

Star Wars Tables

select * from title where tconst in ('tt0076759', 'tt0080684', 'tt0086190', 'tt0120915', 'tt0121765', 'tt0121766', 'tt2488496', 'tt2527336', 'tt2527338', 'tt3748528');

id	tconst	titletype	primarytitle	originaltitle	startyear	runtimeminutes
86190	tt0086190	movie	Star Wars: Episode VI - Return of the Jedi	Star Wars: Episode VI - Return of the Jedi	1983	131
76759	tt0076759	movie	Star Wars: Episode IV - A New Hope	Star Wars	1977	121
80684	tt0080684	movie	Star Wars: Episode V - The Empire Strikes Back	Star Wars: Episode V - The Empire Strikes Back	1980	124
120915	tt0120915	movie	Star Wars: Episode I - The Phantom Menace	Star Wars: Episode I - The Phantom Menace	1999	136
121765	tt0121765	movie	Star Wars: Episode II - Attack of the Clones	Star Wars: Episode II - Attack of the Clones	2002	142
121766	tt0121766	movie	Star Wars: Episode III - Revenge of the Sith	Star Wars: Episode III - Revenge of the Sith	2005	140
2488496	tt2488496	movie	Star Wars: Episode VII - The Force Awakens	Star Wars: Episode VII - The Force Awakens	2015	138
2527336	tt2527336	movie	Star Wars: Episode VIII - The Last Jedi	Star Wars: Episode VIII - The Last Jedi	2017	152
2527338	tt2527338	movie	Star Wars: Episode IX - The Rise of Skywalker	Star Wars: Episode IX - The Rise of Skywalker	2019	142
3748528	tt3748528	movie	Rogue One: A Star Wars Story	Rogue One	2016	133

CREATE TABLE swTitle AS 
select * from title where tconst in ('tt0076759', 'tt0080684', 'tt0086190', 'tt0120915', 'tt0121765', 'tt0121766', 'tt2488496', 'tt2527336', 'tt2527338', 'tt3748528');

CREATE TABLE swGenre AS 
select genre.tconst, genre.genre from genre, swTitle where genre.tconst = swTitle.tconst;

DROP TABLE swCasting;
CREATE TABLE swCasting AS 
select distinct casting.tconst, casting.ordering, casting.nconst, casting.category, casting.job, casting.character from casting, swTitle where casting.tconst = swTitle.tconst;

insert missing values in the casting

INSERT INTO swCasting 
VALUES ('tt0086190', 11, 'nm0000469', 'actor', NULL, 'Darth Vader');
INSERT INTO swCasting 
VALUES ('tt0076759', 11, 'nm0000469', 'actor', NULL, 'Darth Vader');
INSERT INTO swCasting 
VALUES ('tt0080684', 11, 'nm0000469', 'actor', NULL, 'Darth Vader');
INSERT INTO swCasting 
VALUES ('tt3748528', 11, 'nm1872855', 'actor', NULL, 'Darth Vader');

Drop table swPerson;
CREATE TABLE swPerson AS 
select distinct person.nconst, person.primaryname, person.birthyear, person.deathyear from person, swCasting where swCasting.nconst = person.nconst;

select * from swCasting;

Wikidata

We downloaded some triples from Wikidata about Star Wars characters (e.g. Obi-Wan Kenobi), like their performers, their organisation memberships, their relation with others characters (student of, enemy of, child, spouse).

>sw.full.nt

while read i; do
    if [[ -n $i ]] && [[ ${i:0:1} != "#" ]]
    then
       curl https://www.wikidata.org/wiki/Special:EntityData/${i}.nt >> sw.full.nt
    fi
done <wikidata-sw-entity

We keep only certain interesting properties.

>sw.nt

#IMDB character/name/title ID
grep " <http://www.wikidata.org/prop/statement/P345> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/P345> " sw.full.nt >> sw.nt

#Star Wars Databank ID
grep " <http://www.wikidata.org/prop/statement/P3798> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/P3798> " sw.full.nt >> sw.nt

# performer property 
grep " <http://www.wikidata.org/prop/P175> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/statement/P175> " sw.full.nt >> sw.nt

# memberOf property 
grep " <http://www.wikidata.org/prop/P463> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/statement/P463> " sw.full.nt >> sw.nt

# student 
grep " <http://www.wikidata.org/prop/P802> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/statement/P802> " sw.full.nt >> sw.nt

# enemy of
grep " <http://www.wikidata.org/prop/P7047> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/statement/P7047> " sw.full.nt >> sw.nt

# child
grep " <http://www.wikidata.org/prop/P40> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/statement/P40> " sw.full.nt >> sw.nt

# spouse
grep " <http://www.wikidata.org/prop/P26> " sw.full.nt >> sw.nt
grep " <http://www.wikidata.org/prop/statement/P26> " sw.full.nt >> sw.nt


  while read i; do
      if [[ -n $i ]] && [[ ${i:0:1} != "#" ]]
      then
          grep "<http://www.wikidata.org/entity/${i}> <http://www.w3.org/2000/01/rdf-schema#label>" sw.full.nt | grep "@en ." >> sw.nt
      fi
  done <wikidata-sw-entity

Star Wars API

We downloaded also the JSON documents from a Star Wars API, describing character species, spaceships and their pilots etc.

Download

for i in {1..75};
do 
curl https://swapi.co/api/starships/$i/ >> starships.json
done

for i in {1..75};
do 
curl https://swapi.co/api/vehicles/$i/ >> vehicles.json
done

for i in {1..87};
do
curl https://swapi.co/api/people/$i/ >> people.json
done

for i in {1..37};
do
curl https://swapi.co/api/species/ >> species.json
done

Insertion of People IRI

We insert formated name of people to create IRI of the form https://www.starwars.com/databank/luke-skywalker

import json
from json import JSONDecoder, JSONDecodeError
import re

NOT_WHITESPACE = re.compile(r'[^\s]')

def decode_stacked(document, pos=0, decoder=JSONDecoder()):
    while True:
        match = NOT_WHITESPACE.search(document, pos)
        if not match:
            return
        pos = match.start()

        try:
            obj, pos = decoder.raw_decode(document, pos)
        except JSONDecodeError:
            # do something sensible if there's some error
            raise
        yield obj

with open('people.json', newline='') as f, open('people-iri.json', 'w', newline='') as out:
    for obj in decode_stacked(f.read()):
        if 'name' in obj:
            formated_name = obj['name'].lower().replace(" ", "-")
            obj['formatedName'] = formated_name
            json.dump(obj, out)
            out.write('\n')

Loading in MongoDB

We insert 37 starships and 86 people

mongoimport --drop -c=people -d=swapi --mode=merge --file=people.json
mongoimport --drop -c=starships -d=swapi --mode=merge --file=starships.json
mongoimport --drop -c=vehicles -d=swapi --mode=merge --file=vehicles.json
mongoimport --drop -c=species -d=swapi --mode=merge --file=species.json