What happens? <a class="user-mention notranslate" data-hovercard-t

Thanks so much <a class="user-mention notranslate" data-hovercard-type="user" data-hov

<a class="user-mention notranslate" data-hovercard-type="user" data-hovercard-url="/us

Unnest not supported for deeply nested JSON about duckdb HOT 5 OPEN

jaanli commented on May 28, 2024

Unnest not supported for deeply nested JSON

from duckdb.

Comments (5)

cmdlineluser commented on May 28, 2024 1

I think you still want your CASE/WHEN check for when identifier is not an array.

duckdb.sql("""
FROM (
    SELECT json_extract(json(unnested_entry.entry), '$.resource') AS resource
    FROM read_json_auto('example.json', columns={'entry': 'JSON[]'}) AS entries,
         unnest(entries.entry) AS unnested_entry
    WHERE json_extract_string(json(unnested_entry.entry), '$.resource.resourceType') = 'Patient'
) AS patient_resource
SELECT
   json_transform(json_extract(resource, '$.identifier'), '[{"type":{"text":"VARCHAR"},"value":"VARCHAR"}]') AS identifiers
""")

┌──────────────────────────────────────────────────────────────┐
│                         identifiers                          │
│   struct("type" struct("text" varchar), "value" varchar)[]   │
├──────────────────────────────────────────────────────────────┤
│ NULL                                                         │
│ [{'type': {'text': Social Security Number}, 'value': ssn-2}] │
└──────────────────────────────────────────────────────────────┘

json_transform will produce NULL in this case as the schema/type doesn't match. (json_transform_strict() can be used to raise an error instead.)

duckdb.sql("""
FROM (
    SELECT json_extract(json(unnested_entry.entry), '$.resource') AS resource
    FROM read_json_auto('example.json', columns={'entry': 'JSON[]'}) AS entries,
         unnest(entries.entry) AS unnested_entry
    WHERE json_extract_string(json(unnested_entry.entry), '$.resource.resourceType') = 'Patient'
) AS patient_resource
SELECT
    json_transform(
        CASE
        WHEN json_type(json_extract(resource, '$.identifier')) = 'ARRAY' THEN json_extract(resource, '$.identifier')
        ELSE JSON_ARRAY(json_extract(resource, '$.identifier'))
        END,
        '[{"type":{"text":"VARCHAR"},"value":"VARCHAR"}]'
    ) AS identifiers
""")

┌──────────────────────────────────────────────────────────────┐
│                         identifiers                          │
│   struct("type" struct("text" varchar), "value" varchar)[]   │
├──────────────────────────────────────────────────────────────┤
│ [{'type': {'text': Medical Record Number}, 'value': mrn-1}]  │
│ [{'type': {'text': Social Security Number}, 'value': ssn-2}] │
└──────────────────────────────────────────────────────────────┘

from duckdb.

adriens commented on May 28, 2024 1

Yup, I faced this one, had to to split it into two steps then put data back together.

from duckdb.

cmdlineluser commented on May 28, 2024

Hi there - just a fellow user here.

It seems the issue is you're trying to call unnest() on a json type?

duckdb.sql("""
with patient_data as (
   select '[{"type":{"text":"Medical Record Number"},"value":"mrn-1"}]'::json identifiers
)
from patient_data
select unnest(identifiers)
""")

# BinderException: Binder Error: UNNEST() can only be applied to lists, structs and NULL

You can json_transform() it into structs/lists

select unnest(json_transform(identifiers, '[{"type":{"text":"VARCHAR"},"value":"VARCHAR"}]')) entry

┌───────────────────────────────────────────────────────────┐
│                           entry                           │
│  struct("type" struct("text" varchar), "value" varchar)   │
├───────────────────────────────────────────────────────────┤
│ {'type': {'text': Medical Record Number}, 'value': mrn-1} │
└───────────────────────────────────────────────────────────┘

e.g. If I modify your first example:

example1.py

duckdb.sql("""
WITH patient_data AS (
    SELECT
    resource.id AS patient_id,
    json_transform(
       CASE WHEN json_type(resource.identifier) = 'OBJECT' THEN json_array(resource.identifier)
       ELSE resource.identifier
       END,
       '[{"type":{"text":"VARCHAR"},"value":"VARCHAR"}]'
    ) AS identifiers,
    json_transform(
       CASE WHEN json_type(resource.extension) = 'OBJECT' THEN json_array(resource.extension)
       ELSE resource.extension
       END,
       '[{"url":"VARCHAR","extension":[{"valueCoding":{"display":"VARCHAR"}}]}]'
    ) AS extensions
    FROM (
        SELECT unnested_entry.entry.resource AS resource
        FROM 'example.json' AS entries,
             unnest(entries.entry) AS unnested_entry
        WHERE unnested_entry.entry.resource.resourceType = 'Patient'
    ) AS patient_resource
),
identifier_unnested AS (
    SELECT
        patient_id,
        identifier
    FROM patient_data
    CROSS JOIN UNNEST(identifiers) AS t(identifier)
),
extension_unnested AS (
    SELECT
        patient_id,
        extension
    FROM patient_data
    CROSS JOIN UNNEST(extensions) AS t(extension)
)
SELECT * FROM identifier_unnested;
""")

┌────────────┬────────────────────────────────────────────────────────────┐
│ patient_id │                         identifier                         │
│  varchar   │   struct("type" struct("text" varchar), "value" varchar)   │
├────────────┼────────────────────────────────────────────────────────────┤
│ patient-1  │ {'type': {'text': Medical Record Number}, 'value': mrn-1}  │
│ patient-2  │ {'type': {'text': Social Security Number}, 'value': ssn-2} │
└────────────┴────────────────────────────────────────────────────────────┘

from duckdb.

jaanli commented on May 28, 2024

Thanks so much @cmdlineluser - that was it. Was able to fix immediately: rickokin/synthetic-healthcare-data#1

Much cleaner than what I was struggling with before :) appreciate your SQL sorcery!

https://github.com/rickokin/synthetic-healthcare-data/pull/1/files

from duckdb.

rickokin commented on May 28, 2024

@cmdlineluser - thanks for your help - I am trying to extract the longitude and latitude but am not having much luck in figuring out the correct syntax. My current query is as follows but when I try and index the extension,extension I get errors, etc. Appreciate your help if you have time.

`WITH patient_data AS (
SELECT
json_extract_string(resource, '$.id') AS patient_id,
json_transform(json_extract(resource, '$.identifier'), '[{"type":{"text":"VARCHAR"},"value":"VARCHAR"}]') AS identifiers,
json_transform(json_extract(resource, '$.extension'), '[{"url":"VARCHAR","valueString":"VARCHAR","valueCode":"VARCHAR","valueDecimal":"VARCHAR","valueAddress":{"city":"VARCHAR","state":"VARCHAR","country":"VARCHAR"},"extension":[{"valueCoding":{"display":"VARCHAR"}}]}]') AS extensions,
json_transform(json_extract(resource, '$.address'), '[{"city":"VARCHAR","state":"VARCHAR","postalCode":"VARCHAR","country":"VARCHAR","line":["VARCHAR"],"extension":[{"extension":[{"url":"VARCHAR","valueDecimal":"VARCHAR"}]}]}]') AS addresses,
json_extract_string(resource, '$.name[0].given[0]') AS first_name,
json_extract_string(resource, '$.name[0].family') AS last_name,
json_extract_string(resource, '$.name[0].given[1]') AS first_name_alt,
json_extract_string(resource, '$.name[0].prefix[0]') AS patient_name_prefix,
json_extract_string(resource, '$.name[0].use') AS patient_name_use,
json_extract_string(resource, '$.name[1].given[0]') AS first_name2,
json_extract_string(resource, '$.name[1].given[1]') AS first_name_alt2,
json_extract_string(resource, '$.name[1].family') AS last_name2,
json_extract_string(resource, '$.name[1].prefix[0]') AS patient_name_prefix2,
json_extract_string(resource, '$.gender') AS gender,
json_extract_string(resource, '$.birthDate') AS birth_date,
json_extract_string(resource, '$.telecom[0].system') AS telecom_system,
json_extract_string(resource, '$.telecom[0].value') AS telecom_value,
json_extract_string(resource, '$.telecom[0].use') AS telecom_use,
json_extract_string(resource, '$.maritalStatus.text') AS patient_marital_status,
json_extract_string(resource, '$.deceasedDateTime') AS patient_deceased_date,
json_extract_string(resource, '$.multipleBirthBoolean') AS patient_multiple_birth_ind
FROM (
SELECT json_extract(json(unnested_entry.entry), '$.resource') AS resource
FROM read_json_auto('Gudrun69_Shaunna800_Goyette777_d5e33bd1-960e-bcf4-e5f9-9a4afc6d5a5e.json', columns={'entry': 'JSON[]'}) AS entries,
unnest(entries.entry) AS unnested_entry
WHERE json_extract_string(json(unnested_entry.entry), '$.resource.resourceType') = 'Patient'
) AS patient_resource
),
identifier_unnested AS (
SELECT
patient_id,
unnest(identifiers) AS identifier
FROM patient_data
),
identifier_aggregated AS (
SELECT
patient_id,
MIN(CASE WHEN identifier.type.text = 'Medical Record Number' THEN identifier.value END) AS patient_mrn,
MIN(CASE WHEN identifier.type.text = 'Social Security Number' THEN identifier.value END) AS patient_ssn,
MIN(CASE WHEN identifier.type.text = 'Driver''s license number' THEN identifier.value END) AS patient_drivers_license_num,
MIN(CASE WHEN identifier.type.text = 'Passport Number' THEN identifier.value END) AS patient_passport_num
FROM identifier_unnested
GROUP BY patient_id
),
extension_unnested AS (
SELECT
patient_id,
unnest(extensions) AS extension
FROM patient_data
),
extension_aggregated AS (
SELECT
patient_id,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race' THEN extension.extension[1].valueCoding.display END) AS patient_core_race,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity' THEN extension.extension[1].valueCoding.display END) AS patient_core_ethnicity,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex' THEN extension.valueCode END) AS patient_birth_sex,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName' THEN extension.valueString END) AS patient_maiden_name,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/StructureDefinition/patient-birthPlace' THEN extension.valueAddress.city END) AS birthplace_city,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/StructureDefinition/patient-birthPlace' THEN extension.valueAddress.state END) AS birthplace_state,
MIN(CASE WHEN extension.url = 'http://hl7.org/fhir/StructureDefinition/patient-birthPlace' THEN extension.valueAddress.country END) AS birthplace_country,
MIN(CASE WHEN extension.url = 'http://synthetichealth.github.io/synthea/disability-adjusted-life-years' THEN extension.valueDecimal END) AS disability_adjusted_life_years,
MIN(CASE WHEN extension.url = 'http://synthetichealth.github.io/synthea/quality-adjusted-life-years' THEN extension.valueDecimal END) AS quality_adjusted_life_years

  FROM extension_unnested
  GROUP BY patient_id

),
addresses_unnested AS (
SELECT
patient_id,
unnest(addresses) AS address
FROM patient_data
),
combined_data AS (
SELECT
p.patient_id,
p.first_name,
p.last_name,
i.patient_mrn,
i.patient_ssn,
i.patient_drivers_license_num,
i.patient_passport_num,
e.patient_core_race,
e.patient_core_ethnicity,
e.patient_maiden_name,
e.patient_birth_sex,
e.birthplace_city,
e.birthplace_state,
e.birthplace_country,
e.disability_adjusted_life_years,
e.quality_adjusted_life_years,
a.address.line[1],
a.address.city,
a.address.state,
a.address.postalCode,
a.address.country,
a.address
FROM patient_data p
JOIN identifier_aggregated i on i.patient_id = p.patient_id
JOIN extension_aggregated e on e.patient_id = p.patient_id
JOIN addresses_unnested a on a.patient_id = p.patient_id
)

select * from combined_data
`
Sample JSON
Gudrun69_Shaunna800_Goyette777_d5e33bd1-960e-bcf4-e5f9-9a4afc6d5a5e.json
file attached

from duckdb.

Unnest not supported for deeply nested JSON about duckdb HOT 5 OPEN

Comments (5)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent