query-0f5a7bcc411e42b3ff44b2775bf635cb

Article titles and corresponding lexemes (or "placeholder")

Use at

https://query.wikidata.org/sparql

PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX llemma:   <http://wikiba.se/ontology#lemma>
PREFIX lcat:     <http://wikiba.se/ontology#lexicalCategory>
PREFIX llang:    <http://purl.org/dc/terms/language>
PREFIX lforms:   <http://www.w3.org/ns/lemon/ontolex#lexicalForm>
PREFIX lhasform: <http://www.w3.org/ns/lemon/ontolex#representation>
PREFIX lfeature: <http://wikiba.se/ontology#grammaticalFeature>
PREFIX lsenses:  <http://www.w3.org/ns/lemon/ontolex#sense>
PREFIX lglos:    <http://www.w3.org/2004/02/skos/core#definition>

# List the words in some article titles and the corresponding lexemes (or "placeholder")
#  limited to the first 42 words & English-language titles
#  includes the gloss for any of the senses already defined on one of the lexemes (more generally needed)
#  by Jura1, 10 March 2020

# Splits the English title of each selected article into words and looks up,
# for every word, an English lexeme having that word as a form.
#
# Result columns (one row per article / word position):
#   ?a            the article item
#   ?title        its English title
#   ?nv           position of the word inside the cleaned-up title (1-based)
#   ?word         the word at that position, tagged "en"
#   ?all_lemma    lemmas of the matched lexeme(s), "/"-separated
#   ?a_lexeme     one of the matched lexemes (the placeholder wd:L254535 if none matched)
#   ?all_lexcat   English labels of the lexical categories of the matched lexeme(s)
#   ?senses       number of distinct senses on the matched lexeme(s)
#   ?all_glos_en  English glosses of those senses
SELECT
  ?a ?title ?nv ?word
  (GROUP_CONCAT(DISTINCT ?lemma; separator="/") AS ?all_lemma)
  (SAMPLE(?lexeme) AS ?a_lexeme)
  (GROUP_CONCAT(DISTINCT ?lexcatLabel; separator=", ") AS ?all_lexcat)
  (COUNT(DISTINCT ?sense) AS ?senses)
  (GROUP_CONCAT(DISTINCT ?glos_en; separator=" [other sense: ]_") AS ?all_glos_en)

WHERE
{
  {
    SELECT * {
      {
        # Step 1: one page of articles with a normalised, padded title in ?input.
        # NOTE(review): the first character class below contains a whitespace
        # character between "\\?" and "%". If that is a plain U+0020 space, every
        # space is stripped and the whole title collapses into one word - confirm
        # which character is really stored in the file.
        SELECT ?a ?title (REPLACE(CONCAT(REPLACE(REPLACE(REPLACE(str(?title),'[\\.:,\\? %";\\]\\[]',""),"[\\(\\)–“—”]"," "),"’","'"), 
          " Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z")
          ,"[ ]{2,100}"," ") AS ?input)
        #skip some characters in title
        #pad with " Z" in case the title is shorter than 42 words. "Z" is filtered out later.
        WHERE
        {
          # { ?a wdt:P921 wd:Q84263196 } UNION { ?a wdt:P921 wd:Q81068910 }    UNION { ?a wdt:P921 wd:Q57751738 }
          # UNION { ?a wdt:P921 wd:Q1134583 }  UNION { ?a wdt:P921 wd:Q82069695 }    UNION { ?a wdt:P921 wd:Q290805 }
          # above for COVID-19, use the following for ZIKA instead, add other items as needed
          ?a wdt:P921 wd:Q202864 .
          ?a wdt:P1476 ?title .
          FILTER(lang(?title)="en")
        }
        # OFFSET/LIMIT only select a repeatable page when the order is fixed;
        # without ORDER BY the endpoint may return a different slice on every run.
        ORDER BY ?a ?title
        OFFSET 1000 LIMIT 200
      }
      {
        # Step 2: word positions 1..42, each with a regex that captures the
        # word at that position as group 2. Items carrying both P5176 and P1181
        # are used as a ready-made source of numbers.
        # DISTINCT keeps several items with the same value from multiplying rows;
        # the lower bound keeps the quantifier "{n-1}" from going negative.
        SELECT DISTINCT ?re ?nv WHERE {
          ?n wdt:P5176 [] ; wdt:P1181 ?nv .
          FILTER( ?nv >= 1 && ?nv < 43 )
          BIND( CONCAT( "^([^ ]+ ){", str( ?nv - 1 ),"}([^ ]+) .*") AS ?re)
        }
      }
      # Step 3: every title is combined with every position; extract that word.
      BIND( strlang(REPLACE(?input, ?re, "$2"),"en") AS ?word)
    }
  }

  # drop the padding words
  FILTER ( ?word != "Z"@en )

  # NOTE(review): ?lf is shared by all four OPTIONALs below. Once one of them
  # binds it, the later ones can only match that same form. This looks harmless
  # because COALESCE further down prefers the earlier match anyway - confirm
  # before renaming the variable (separate names would multiply rows).

  # check against English forms of lexemes ("en")
  OPTIONAL { ?lf lhasform: ?word . ?lexeme1 lforms: ?lf ; llang: wd:Q1860 }

  # check against lowercase forms
  BIND( LCASE(?word) AS ?lword)
  OPTIONAL { ?lf lhasform: ?lword . ?lexeme2 lforms: ?lf ; llang: wd:Q1860 }

  # check against noun forms with 's (incomplete, not en-gb, not different cases)
  BIND( strlang(REPLACE(str(?word),"^(.+)'s$", "$1"),"en") AS ?noun)
  OPTIONAL { ?lf lhasform: ?noun . ?lexeme5 lforms: ?lf ; llang: wd:Q1860 ; lcat: wd:Q1084 }

  # check against British-English lowercase forms ("en-gb")
  BIND( strlang(LCASE(str(?word)),"en-gb") AS ?lword_engb)
  OPTIONAL { ?lf lhasform: ?lword_engb . ?lexeme3 lforms: ?lf ; llang: wd:Q1860 }

  # exclude digits-only strings and 1-letter strings
  FILTER ( !REGEX( str(?word), "^\\d+$" ) )
  FILTER ( strlen( STR(?word) ) != 1 )

  # first successful match wins; fall back to the placeholder lexeme so that
  # unmatched words still produce a row
  BIND( COALESCE( ?lexeme1, ?lexeme2, ?lexeme5, ?lexeme3, wd:L254535 ) AS ?lexeme)
  ?lexeme llemma: ?lemma ; lcat: ?lexcat .
  ?lexcat rdfs:label ?lexcatLabel . FILTER(lang(?lexcatLabel) = "en") .
  OPTIONAL {
    ?lexeme lsenses: ?sense .
    OPTIONAL { ?sense lglos: ?glos_en . FILTER(lang(?glos_en) = "en") }
  }
}
GROUP BY ?a ?title ?nv ?word
ORDER BY ?a ?nv

Query found at

https://www.wikidata.org/wiki/Wikidata:Request_a_query/Archive/2020/03

graph TD classDef projected fill:lightgreen; classDef literal fill:orange; classDef iri fill:yellow; v1("?a"):::projected v23("?a_lexeme") v26("?all_glos_en") v22("?all_lemma") v24("?all_lexcat") v20("?glos_en"):::projected v6("?input") v18("?lemma"):::projected v19("?lexcat") v3("?lexcatLabel"):::projected v17("?lexeme"):::projected v10("?lexeme1") v12("?lexeme2") v16("?lexeme3") v14("?lexeme5") v9("?lf") v11("?lword") v15("?lword_engb") v7("?n") v13("?noun") v2("?nv"):::projected v8("?re") v21("?sense"):::projected v25("?senses") v5("?title"):::projected v9("?word"):::projected a1((" ")) c6(["wd:Q202864"]):::iri c16(["wd:Q1084"]):::iri c14(["wd:Q1860"]):::iri f0[["?lexcatLabel = 'en'"]] f0 --> v3 f1[["string-length(str(?word)) != '1^^xsd:integer'"]] f1 --> v9 f2[["not regex(str(?word),'^\d+$')"]] f2 --> v9 f3[["?word != sZ^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#langString>'"]] f3 --> v9 f4[["?title = 'en'"]] f4 --> v5 v1 --"wdt:P921"--> c6 v1 --"wdt:P1476"--> v5 bind5[/"replace(concat(replace(replace(replace(str(?title),'#91;\.:,\? 
%";\#93;\#91;#93;',''),'#91;–“—”#93;',' '),'’','''),' Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z Z'),'#91; #93;{2,100}',' ')"/] v5 --o bind5 bind5 --as--o v6 f6[["?nv < '43^^xsd:integer'"]] f6 --> v2 v7 --"wdt:P5176"--> a1 v7 --"wdt:P1181"--> v2 bind7[/"concat('^(#91;^ #93;+ ){',str(?nv - '1^^xsd:integer'),'}(#91;^ #93;+) .*')"/] v2 --o bind7 bind7 --as--o v8 bind8[/"STRLANG(replace(?input,?re,'$2'),'en')"/] v6 --o bind8 v8 --o bind8 bind8 --as--o v9 subgraph optional0["(optional)"] style optional0 fill:#bbf,stroke-dasharray: 5 5; v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v9 v10 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9 v10 --http://purl.org/dc/terms/language--> c14 end bind9[/"lower-case(?word)"/] v9 --o bind9 bind9 --as--o v11 subgraph optional1["(optional)"] style optional1 fill:#bbf,stroke-dasharray: 5 5; v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v11 v12 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9 v12 --http://purl.org/dc/terms/language--> c14 end bind10[/"STRLANG(replace(str(?word),'^(.+)'s$','$1'),'en')"/] v9 --o bind10 bind10 --as--o v13 subgraph optional2["(optional)"] style optional2 fill:#bbf,stroke-dasharray: 5 5; v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v13 v14 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9 v14 --http://purl.org/dc/terms/language--> c14 v14 --http://wikiba.se/ontology#lexicalCategory--> c16 end bind11[/"STRLANG(lower-case(str(?word)),'en-gb')"/] v9 --o bind11 bind11 --as--o v15 subgraph optional3["(optional)"] style optional3 fill:#bbf,stroke-dasharray: 5 5; v9 -.http://www.w3.org/ns/lemon/ontolex#representation.-> v15 v16 --http://www.w3.org/ns/lemon/ontolex#lexicalForm--> v9 v16 --http://purl.org/dc/terms/language--> c14 end bind12[/"?lexeme1?lexeme2?lexeme5?lexeme3'wd:L254535'"/] v10 --o bind12 v12 --o bind12 v14 --o bind12 v16 --o bind12 bind12 --as--o v17 v17 
--http://wikiba.se/ontology#lemma--> v18 v17 --http://wikiba.se/ontology#lexicalCategory--> v19 v19 --"rdfs:label"--> v3 subgraph optional4["(optional)"] style optional4 fill:#bbf,stroke-dasharray: 5 5; v17 -.http://www.w3.org/ns/lemon/ontolex#sense.-> v21 subgraph optional5["(optional)"] style optional5 fill:#bbf,stroke-dasharray: 5 5; v21 -.http://www.w3.org/2004/02/skos/core#definition.-> v20 end end bind18[/"?lemma"/] v18 --o bind18 bind18 --as--o v22 bind19[/"sample(?lexeme)"/] v17 --o bind19 bind19 --as--o v23 bind20[/"?lexcatLabel"/] v3 --o bind20 bind20 --as--o v24 bind21[/"count(?sense)"/] v21 --o bind21 bind21 --as--o v25 bind22[/"?glos_en"/] v20 --o bind22 bind22 --as--o v26