如果您已经打开了 门 文章,那么确保您已经使用ClickHouse,并且您会错过有关它的便利性和速度的有趣细节,并且直截了当-实际上,如何在ClickHouse中创建字典并使用它们。
ClickHouse中的词典是什么?
— key
→ value
. ClickHouse, ETL- .
ClickHouse :
- ClickHouse .
-
TTL
– ClickHouse . - ClickHouse — XML- DDL-.
: / , HTTP(s) , ..
xml-, dictionaries_config
ClickHouse.
, dictionaries_lazy_load
.
( ) — .
, , system.dictionaries
, :
- ;
- ;
- , RAM .
xml DDL-. , — DDL-.
xml :
<yandex>
<!-- , -->
<comment>Some comments</comment>
<!-- , -->
<include_from>/etc/metrika.xml</include_from>
<dictionary>
<!-- -->
</dictionary>
...
<dictionary>
<!-- -->
</dictionary>
</yandex>
DDL-, .
:
<dictionary>
<name>clients</name>
<surce>
<clickhouse>
<host>myHostName</host>
<port>9000</port>
<user>admin</user>
<password>secret_password</password>
<db>clients</db>
<table>users</table>
<where>id<=10</where>
</clickhouse>
</surce>
<lifetime>
<min>3600</min>
<max>5400</max>
</lifetime>
<layout>
<flat/>
</layout>
<structure>
<id>user_id</id>
<attribute>
<name>username</name>
<type>string</type>
</attribute>
<attribute>
<name>age</name>
<type>Int8</type>
</attribute>
</structure>
</dictionary>
:
name
— ;source
— ;lifetime
— ;layout
— . ;structure
— . , .
DDL-:
CREATE DICTIONARY dict_users_id (
id UInt64,
username String,
email String,
status UInt16,
hash String
)
PRIMARY KEY id
SOURCE(MYSQL(
port 3306
user clickhouse
password secret_password
replica(host 'mysql1.fevlake.com' priority 1)
db fevlake_dicts
table users
))
LAYOUT(HASHED())
LIFETIME(MIN 3600 MAX 5400);
. — :
- HTTP(s)
— , .
:
<surce>
<file>
<path>/opt/dictionaries/clients.csv</path>
<format>CSV</format>
</file>
</surce>
:
path
— .format
— . ClickHouse.
DDL-:
SOURCE(FILE(path '/opt/dictionaries/clients.csv' format 'CSV'))
SETTINGS(format_csv_allow_single_quotes = 0)
MySQL .
:
<surce>
<mysql>
<port>3306</port>
<user>clickhouse</user>
<password>secret_password</password>
<replica>
<host>example01-1</host>
<priority>1</priority>
</replica>
<replica>
<host>example01-2</host>
<priority>1</priority>
</replica>
<db>db_name</db>
<table>table_name</table>
<where>id=10</where>
<invalidate_query>SQL_QUERY</invalidate_query>
</mysql>
</surce>
port
— MySQL.<replica>
.user
— MySQL.<replica>
.password
— MySQL.<replica>
.replica
— . .db
— .table
— .where
— .WHERE
MySQL, ,id >= 3 AND id < 10
( ).invalidate_query
— ( ).
DDL-:
SOURCE(MYSQL(
port 3306
user clickhouse
password secret_password
replica(host 'mysql1.fevlake.com' priority 1)
db fevlake_dicts
table users
))
ClickHouse:
flat
hashed
sparse_hashed
cache
direct
range_hashed
complex_key_hashed
complex_key_cache
complex_key_direct
ip_trie
3, , — flat
, hashed
complex_key_hashed
. .
Flat
, . UInt64
500 000, ClickHouse .
.
:
<layout>
<flat/>
</layout>
LAYOUT(FLAT())
Hashed
- . , .
:
<layout>
<hashed/>
</layout>
LAYOUT(HASHED())
omplex_key_hashed
. hashed
.
:
<layout>
<hashed/>
</layout>
LAYOUT(COMPLEX_KEY_HASHED())
<structure>
, .
XML:
<structure>
<id>user_id</id>
<attribute>
<name>username</name>
<type>string</type>
</attribute>
<attribute>
<name>age</name>
<type>Int8</type>
</attribute>
</structure>
:
<id>
— ;<attribute>
— . .
ClickHouse :
- .
UInt64
.<id>
PRIMARY KEY
. - . .
<key>
PRIMARY KEY
.
: UInt64
.
:
<id>
<name>user_id</name>
</id>
CREATE DICTIONARY (
user_id UInt64,
...
)
PRIMARY KEY user_id
...
PRIMARY KEY
– .
(tuple
) . layout
complex_key_hashed
complex_key_cache
.
<key>
. , . :
<key>
<attribute>
<name>field1</name>
<type>String</type>
</attribute>
<attrbute>
<name>field2</name>
<type>UInt32</type>
</attribute>
...
</key>
CREATE DICTIONARY ( field1 String, field2 String ... )
PRIMARY KEY field1, field2
...
<structure>
...
<attribute>
<name>Name</name>
<type>ClickHouseDataType</type>
<null_value></null_value>
<expression>rand64()</expression>
<hierarchical>true</hierarchical>
<injective>true</injective>
<is_object_id>true</is_object_id>
</attribute>
</structure>
CREATE DICTIONARY somename (
Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID
)
ClickHouse
ClickHouse — IP (v4) .
, : ip String
country String
. GeoIP2 MaxMind.
MaxMind .mmdb API .
ClickHouse .mmdb, – MaxMind CSV, .
IP , :
GeoIP2-Country-Blocks-IPv4.csv
– IP ID ;GeoIP2-Country-Locations-en.csv
– .
, DDL:
CREATE DICTIONARY dicts.geoip_country_blocks_ipv4 (
network String DEFAULT '',
geoname_id UInt64 DEFAULT 0,
registered_country_geoname_id UInt64 DEFAULT 0,
represented_country_geoname_id UInt64 DEFAULT 0,
is_anonymous_proxy UInt8 DEFAULT 0,
is_satellite_provider UInt8 DEFAULT 0
)
PRIMARY KEY network
SOURCE(FILE(
path '/var/lib/clickhouse/user_files/GeoIP2-Country-Blocks-IPv4.csv'
format 'CSVWithNames'
))
LAYOUT(IP_TRIE())
LIFETIME(300);
geoip_country_blocks_ipv4
:
network
– IP , .geoname_id
– ID .
– CSV.
ClickHouse ID, ip_trie
. IP .
geoip_country_locations_en
:
CREATE DICTIONARY dicts.geoip_country_locations_en (
geoname_id UInt64 DEFAULT 0,
locale_code String DEFAULT '',
continent_code String DEFAULT '',
continent_name String DEFAULT '',
country_iso_code String DEFAULT '',
country_name String DEFAULT '',
is_in_european_union UInt8 DEFAULT 0
)
PRIMARY KEY geoname_id
SOURCE(FILE(
path '/var/lib/clickhouse/user_files/GeoIP2-Country-Locations-en.csv'
format 'CSVWithNames'
))
LAYOUT(HASHED())
LIFETIME(300);
ID . GeoIP2-Country-Locations-en.csv
:
geoname_id
– ID , , .country_name
– .
hashed
.
CSV .
, :
CREATE TEMPORARY TABLE user_visits (user_ip String, user_id UUID);
---
INSERT INTO user_visits VALUES
('178.248.237.68', generateUUIDv4()),
('82.192.95.175', generateUUIDv4());
. – dictGet*
:
SELECT
dictGetString('dicts.geoip_city_locations_en', 'country_name', users_country_id) AS users_country,
uniqs
FROM (
SELECT
dictGetUInt64('dicts.geoip_country_blocks_ipv4', 'geoname_id', tuple(IPv4StringToNum(user_ip))) AS users_country_id,
uniq(user_id) AS uniqs
FROM user_visits
GROUP BY users_country_id
);
:
┌─users_country─┬─uniqs─┐
│ Russia │ 1 │
│ Netherlands │ 1 │
└───────────────┴───────┘
2 rows in set. Elapsed: 0.003 sec.
:
-
user_ip
,ip_trie
-:tuple(IPv4StringToNum(user_ip))
; - , ID
users_country_id
:dictGetUInt64('geoip_country_blocks_ipv4', 'geoname_id', ...) as users_country_id
; - :
uniq(user_id) as uniq_users
; - ID , :
GROUP BY users_country_id
; - , ID , :
dictGetString('geoip_city_locations_en', 'country_name', users_country_id) AS users_country
.
. GeoIP2 , :)
. , ClickHouse .