query-0f5a7bcc411e42b3ff44b2775bf635cb
Article titles and corresponding lexemes (or "placeholder")
Use at
- https://query.wikidata.org/sparql
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX llemma: <http://wikiba.se/ontology#lemma>
PREFIX lcat: <http://wikiba.se/ontology#lexicalCategory>
PREFIX llang: <http://purl.org/dc/terms/language>
PREFIX lforms: <http://www.w3.org/ns/lemon/ontolex#lexicalForm>
PREFIX lhasform: <http://www.w3.org/ns/lemon/ontolex#representation>
PREFIX lfeature: <http://wikiba.se/ontology#grammaticalFeature>
PREFIX lsenses: <http://www.w3.org/ns/lemon/ontolex#sense>
PREFIX lglos: <http://www.w3.org/2004/02/skos/core#definition>
# List the words in some article titles and their corresponding lexemes (or "placeholder")
# limited to the first 42 words of English-language titles
# includes glosses for any of the senses already defined on one of the lexemes (more generally needed)
# by Jura1, 10 March 2020
# Break English article titles into words and match each word against English lexemes.
#
# One result row per (article ?a, ?title, word position ?nv, ?word), with:
#   ?all_lemma    "/"-joined lemmas of the matched lexeme(s)
#   ?a_lexeme     one matched lexeme (arbitrary sample)
#   ?all_lexcat   English labels of the lexical categories of the matched lexeme(s)
#   ?senses       number of distinct senses found on the matched lexeme(s)
#   ?all_glos_en  English glosses of those senses
# Words with no matching form fall back to wd:L254535 (presumably the "placeholder" lexeme).
SELECT
  ?a ?title ?nv ?word
  (GROUP_CONCAT(DISTINCT ?lemma; separator="/") AS ?all_lemma)
  (SAMPLE(?lexeme) AS ?a_lexeme)
  (GROUP_CONCAT(DISTINCT ?lexcatLabel; separator=", ") AS ?all_lexcat)
  (COUNT(DISTINCT ?sense) AS ?senses)
  (GROUP_CONCAT(DISTINCT ?glos_en; separator=" [other sense: ]_") AS ?all_glos_en)
WHERE
{
  # ---- Step 1: one row per (title, word position) ----
  # Only the columns used further down are projected; ?input and ?re are helpers.
  { SELECT ?a ?title ?nv ?word {

      # 1a. A page of English titles, cleaned up and padded.
      #     - the first REPLACE drops the characters in the first class,
      #       the second turns brackets/dashes/curly quotes into a space,
      #       the third normalises the typographic apostrophe to '
      #     - the title is padded with " Z" tokens so that titles shorter than
      #       42 words still have a token at every position; "Z" is filtered out later
      #     - the outer REPLACE collapses runs of spaces into one
      #     NOTE(review): the first character class contains a space as shown here;
      #     if that is a plain U+0020 all word separators would be removed - confirm
      #     the exact character against the original query.
      { SELECT ?a ?title
               (REPLACE(
                  CONCAT(
                    REPLACE(REPLACE(REPLACE(STR(?title),'[\\.:,\\? %";\\]\\[]',""),"[\\(\\)–“—”]"," "),"’","'"),
                    " Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z"),
                  "[ ]{2,100}"," ") AS ?input)
        WHERE
        {
          # { ?a wdt:P921 wd:Q84263196 } UNION { ?a wdt:P921 wd:Q81068910 } UNION { ?a wdt:P921 wd:Q57751738 }
          # UNION { ?a wdt:P921 wd:Q1134583 } UNION { ?a wdt:P921 wd:Q82069695 } UNION { ?a wdt:P921 wd:Q290805 }
          # above for COVID-19, use the following for ZIKA instead, add other items as needed
          ?a wdt:P921 wd:Q202864 .
          ?a wdt:P1476 ?title .
          FILTER( LANG(?title) = "en" )
        }
        # OFFSET/LIMIT only select a well-defined page when the order is fixed,
        # so sort before paging. Adjust OFFSET to walk through further pages.
        ORDER BY ?a STR(?title)
        OFFSET 1000 LIMIT 200
      }

      # 1b. Word positions 1..42 and, for each, a regex whose 2nd group is the
      #     ?nv-th space-delimited token of ?input.
      #     The positions are taken from the numeric value (P1181) of items that
      #     also carry a P5176 statement; DISTINCT keeps one row per position even
      #     if several such items share a value.
      { SELECT DISTINCT ?re ?nv
        WHERE
        {
          ?n wdt:P5176 [] ;
             wdt:P1181 ?nv .
          FILTER( ?nv < 43 )
          BIND( CONCAT( "^([^ ]+ ){", STR( ?nv - 1 ),"}([^ ]+) .*") AS ?re )
        }
      }

      # 1c. Extract the token at position ?nv and tag it as English.
      BIND( STRLANG(REPLACE(?input, ?re, "$2"),"en") AS ?word )
    }
  }

  # ---- Step 2: discard tokens that are not worth looking up ----
  # padding tokens
  FILTER ( ?word != "Z"@en )
  # exclude digits-only strings and 1-letter strings
  FILTER ( !REGEX( STR(?word), "^\\d+$" ) )
  FILTER ( STRLEN( STR(?word)) != 1 )

  # ---- Step 3: look the word up among English lexemes (wd:Q1860), most exact match first ----
  # check against English forms of lexemes ("en")
  OPTIONAL { ?lf lhasform: ?word . ?lexeme1 lforms: ?lf ; llang: wd:Q1860 }

  # check against lowercase forms
  BIND( LCASE(?word) AS ?lword )
  OPTIONAL { ?lf lhasform: ?lword . ?lexeme2 lforms: ?lf ; llang: wd:Q1860 }

  # check against noun forms with 's (incomplete, not en-gb, not different cases)
  BIND( STRLANG(REPLACE(STR(?word),"^(.+)'s$", "$1"),"en") AS ?noun )
  OPTIONAL { ?lf lhasform: ?noun . ?lexeme5 lforms: ?lf ; llang: wd:Q1860 ; lcat: wd:Q1084 }

  # check against British-English lowercase forms ("en-gb")
  BIND( STRLANG(LCASE(STR(?word)),"en-gb") AS ?lword_engb )
  OPTIONAL { ?lf lhasform: ?lword_engb . ?lexeme3 lforms: ?lf ; llang: wd:Q1860 }

  # First successful lookup wins; otherwise use the fallback lexeme so that the
  # word still appears in the result.
  BIND( COALESCE( ?lexeme1, ?lexeme2, ?lexeme5, ?lexeme3, wd:L254535 ) AS ?lexeme )

  # ---- Step 4: details of the chosen lexeme ----
  ?lexeme llemma: ?lemma ;
          lcat: ?lexcat .
  ?lexcat rdfs:label ?lexcatLabel .
  FILTER( LANG(?lexcatLabel) = "en" )

  # Senses are optional, and so are their English glosses.
  OPTIONAL {
    ?lexeme lsenses: ?sense .
    OPTIONAL { ?sense lglos: ?glos_en . FILTER( LANG(?glos_en) = "en" ) }
  }
}
GROUP BY ?a ?title ?nv ?word
ORDER BY ?a ?nv
Query found at
%% Flowchart of the SPARQL query above: variables are round nodes, constants are
%% stadium nodes, FILTERs are subroutine boxes, BIND/aggregate expressions are
%% trapezoids, OPTIONAL groups are dashed subgraphs.
graph TD
classDef projected fill:lightgreen;
classDef literal fill:orange;
classDef iri fill:yellow;
%% ---- variables ----
v1("?a"):::projected
v23("?a_lexeme")
v26("?all_glos_en")
v22("?all_lemma")
v24("?all_lexcat")
v20("?glos_en"):::projected
v6("?input")
v18("?lemma"):::projected
v19("?lexcat")
v3("?lexcatLabel"):::projected
v17("?lexeme"):::projected
v10("?lexeme1")
v12("?lexeme2")
v16("?lexeme3")
v14("?lexeme5")
v9("?lf")
v11("?lword")
v15("?lword_engb")
v7("?n")
v13("?noun")
v2("?nv"):::projected
v8("?re")
v21("?sense"):::projected
v25("?senses")
v5("?title"):::projected
%% ?word has its own id (v4); it previously shared v9 with ?lf, which merged the
%% two variables into a single node.
v4("?word"):::projected
%% ---- blank node and constants ----
a1((" "))
c6(["wd:Q202864"]):::iri
c16(["wd:Q1084"]):::iri
c14(["wd:Q1860"]):::iri
%% ---- filters ----
f0[["lang(?lexcatLabel) = 'en'"]]
f0 --> v3
f1[["string-length(str(?word)) != '1^^xsd:integer'"]]
f1 --> v4
f2[["not regex(str(?word),'^\d+$')"]]
f2 --> v4
f3[["?word != sZ^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#langString>'"]]
f3 --> v4
f4[["lang(?title) = 'en'"]]
f4 --> v5
%% ---- title selection and clean-up ----
v1 --"wdt:P921"--> c6
v1 --"wdt:P1476"--> v5
%% the double quote inside the label is written as #quot; so it does not end the label
bind5[/"replace(concat(replace(replace(replace(str(?title),'#91;\.:,\? %#quot;;\#93;\#91;#93;',''),'#91;\(\)–“—”#93;',' '),'’','''),' Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z'),'#91; #93;{2,100}',' ')"/]
v5 --o bind5
bind5 --as--o v6
%% ---- word positions and extraction of the word ----
f6[["?nv < '43^^xsd:integer'"]]
f6 --> v2
v7 --"wdt:P5176"--> a1
v7 --"wdt:P1181"--> v2
bind7[/"concat('^(#91;^ #93;+ ){',str(?nv - '1^^xsd:integer'),'}(#91;^ #93;+) .*')"/]
v2 --o bind7
bind7 --as--o v8
bind8[/"STRLANG(replace(?input,?re,'$2'),'en')"/]
v6 --o bind8
v8 --o bind8
bind8 --as--o v4
%% ---- lookup 1: exact form ----
subgraph optional0["(optional)"]
style optional0 fill:#bbf,stroke-dasharray: 5 5;
v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v4
v10 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9
v10 --http://purl.org/dc/terms/language--> c14
end
%% ---- lookup 2: lowercase form ----
bind9[/"lower-case(?word)"/]
v4 --o bind9
bind9 --as--o v11
subgraph optional1["(optional)"]
style optional1 fill:#bbf,stroke-dasharray: 5 5;
v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v11
v12 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9
v12 --http://purl.org/dc/terms/language--> c14
end
%% ---- lookup 3: noun form with a trailing 's removed ----
bind10[/"STRLANG(replace(str(?word),'^(.+)'s$','$1'),'en')"/]
v4 --o bind10
bind10 --as--o v13
subgraph optional2["(optional)"]
style optional2 fill:#bbf,stroke-dasharray: 5 5;
v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v13
v14 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9
v14 --http://purl.org/dc/terms/language--> c14
v14 --http://wikiba.se/ontology#lexicalCategory--> c16
end
%% ---- lookup 4: lowercase form tagged en-gb ----
bind11[/"STRLANG(lower-case(str(?word)),'en-gb')"/]
v4 --o bind11
bind11 --as--o v15
subgraph optional3["(optional)"]
style optional3 fill:#bbf,stroke-dasharray: 5 5;
v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v15
v16 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9
v16 --http://purl.org/dc/terms/language--> c14
end
%% ---- pick the first lexeme found, else the fallback lexeme ----
bind12[/"coalesce(?lexeme1,?lexeme2,?lexeme5,?lexeme3,wd:L254535)"/]
v10 --o bind12
v12 --o bind12
v14 --o bind12
v16 --o bind12
bind12 --as--o v17
%% ---- lexeme details ----
v17 --http://wikiba.se/ontology#lemma--> v18
v17 --http://wikiba.se/ontology#lexicalCategory--> v19
v19 --"rdfs:label"--> v3
subgraph optional4["(optional)"]
style optional4 fill:#bbf,stroke-dasharray: 5 5;
v17 -.http://www.w3.org/ns/lemon/ontolex#sense.-> v21
subgraph optional5["(optional)"]
style optional5 fill:#bbf,stroke-dasharray: 5 5;
v21 -.http://www.w3.org/2004/02/skos/core#definition.-> v20
end
end
%% ---- aggregates in the outer SELECT ----
bind18[/"?lemma"/]
v18 --o bind18
bind18 --as--o v22
bind19[/"sample(?lexeme)"/]
v17 --o bind19
bind19 --as--o v23
bind20[/"?lexcatLabel"/]
v3 --o bind20
bind20 --as--o v24
bind21[/"count(?sense)"/]
v21 --o bind21
bind21 --as--o v25
bind22[/"?glos_en"/]
v20 --o bind22
bind22 --as--o v26