PubChem Details

From Chemical Informatics and Cyberinfrastructure Collaboratory

Contents

Table Summary

The local mirror aims to provide a more or less comprehensive copy of the data in PubChem. Currently there are 6 tables

  • pubchem_compound
  • pubchem_substance
  • pubchem_synonym
  • pubchem_subst_syn
  • pubchem_scass
  • pubchem_derived

Example Queries

  • Select all the rows from the compound synonym table that contain acetyl as part of the synonym
select * from pubchem_synonym where synonym ~* 'acetyl';
  • Get the unique ID's for the compounds whose synonyms contain the phrase acetyl
select * from ( select distinct on (cid) cid, ccid_synid from pubchem_synonym
          where synonym ~* 'acetyl') temp order by ccid_synid;
  • Get the SMILES for entries that have a specified synonym
select distinct openeye_can_smiles 
       from pubchem_compound where cid in 
        ( select distinct cid from pubchem_synonym 
                where lower(synonym) = 'morphine');
  • Get the SMILES for compounds whose SlogP is in a specified range
select  distinct openeye_can_smiles 
from pubchem_compound where cid in 
        ( select cid from pubchem_derived where
              ((slogp >= 1.5) and (slogp <= 1.8034)) );

Table Schema

pubchem_compound (
cid                     varchar(256) primary key,
iupac_openeye_name      text,
iupac_cas_name          text,
iupac_name              text,
iupac_systematic_name   text,
iupac_traditional_name  text,
nist_inchi              text,
cactvs_xlogp            real,
cactvs_exact_mass       real check(cactvs_exact_mass >= 0),
openeye_mw              real check(openeye_mw >= 0),
openeye_can_smiles      text not null,
openeye_iso_smiles      text,
cactvs_tpsa             real,
total_charge            smallint,
heavy_atom_count        integer check(heavy_atom_count >= 0),
gfp                     bit(1024)
);
pubchem_substance (
sid                       varchar(256) primary key,
substance_version         integer check (substance_version > 0),
compound_id_type          integer check (compound_id_type >= 0),
ext_datasource_name       text,
ext_datasource_regid      text,
ob_smiles                 text
);
pubchem_synonym (
ccid_synid                bigserial not null primary key,
cid                       varchar(256) references pubchem_compound
                                       on update cascade
                                       on delete cascade,
synonym                   text not null,
unique(cid, synonym)
);
pubchem_subst_syn (
ccid_ssynid               bigserial primary key,
sid                       varchar(256) 
                                       references pubchem_substance
                                       on update cascade
                                       on delete cascade,
synonym                   text not null,
unique(sid, synonym)
);
pubchem_scass (
ccid_scid                 bigserial primary key,
sid                       varchar(256) references pubchem_substance (sid)
                                       on update cascade
                                       on delete cascade,
cid                       varchar(256) references pubchem_compound (cid)
                                       on update cascade
                                       on delete cascade,       
compound_id_type          integer check (compound_id_type >= 0),
unique(sid, cid, compound_id_type)
);
pubchem_derived (
cid                        varchar(256) primary key 
                                        references pubchem_compound (cid)
                                        on update cascade
                                        on delete cascade,
SlogP                      double precision default null,
SMRef                         double precision default null
);

Table Indices

compound_cansmi_idx on pubchem_compound (openeye_can_smiles)

syn_idx on pubchem_synonym (synonym)
lower_syn_index on pubchem_synonym (lower(synonym))

ssyn_idx on pubchem_subst_syn (synonym);
lower_ssyn_idx on pubchem_subst_syn (lower(synonym))

sid_idx on pubchem_scass (sid)

slogp_idx on pubchem_derived (slogp)