exist table:
id | name | attr1 | attr2 | attr3 | attr4 | attr5 | attr6 | attr7 | attr8 |
---|---|---|---|---|---|---|---|---|---|
1 | aaa | true | true | true | true | true | true | false | false |
2 | bbb | false | false | false | false | true | false | false | false |
3 | ccc | true | false | false | false | false | true | false | false |
target table:
id | name | attr | flag |
---|---|---|---|
1 | aaa | attr1 | true |
1 | aaa | attr2 | true |
1 | aaa | attr3 | true |
1 | aaa | attr4 | true |
… | … | … | … |
2 | bbb | attr1 | false |
2 | bbb | attr2 | false |
… | … | … | … |
python spark:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder \
.appName("Example") \
.getOrCreate()
data = [
(1, "aaa", true, true, true...),
(2, "bbb", false, false, false...),
(3, "ccc", true, false, false...)
]
columns = ["id", "name", "attr1", "attr2", "attr3"...]
df = spark.createDataFrame(data, columns)
cols_to_exclude = ["id", "name"]
cols_to_transform = [c for c in df.columns if c not in cols_to_exclude]
stack_expr = "stack({},{})".format(
len(cols_to_transform),
','.join(["'{}', {}".format(c, c) for c in cols_to_transform])
)
df_rows = df.selectExpr(*cols_to_exclude, stack_expr).select(*cols_to_exclude, col("col0").alias("attr"), col("col1").alias("flag"))