Do we actually use characters that require more than one byte?
I couldn’t find a way to identify the columns that actually contain a character with a code point value greater than 255. As a result, I ended up writing the following. If you have a better way, please reply.
import java.sql.*;

/**
 * Scans a sample of rows from a table and reports which values in the second
 * selected column contain at least one character whose code point is greater
 * than 255, printing a running percentage of such rows.
 *
 * <p>The row sample comes from the {@code sample(1)} clause in the query
 * (Oracle row-sampling syntax; presumably about 1 percent of rows — confirm
 * against the Oracle SQL reference).
 */
public class checkUnicode {

    /** JDBC connection string (credentials and host are masked in the source). */
    private static final String JDBC_URL =
            "jdbc:oracle:thin:*****/****@******:2484/fake_db_service";

    /** Column 1 identifies the row in the output; column 2 is the text that is scanned. */
    private static final String QUERY =
            "select fake_col1, fake_col2 from large_fake_table sample(1)";

    /** Code points above this value count as a hit. */
    private static final int MAX_PLAIN_CODE_POINT = 255;

    /** A progress line is printed every time this many rows have been checked. */
    private static final int REPORT_INTERVAL = 100;

    /**
     * Runs the scan and prints the results to standard output.
     *
     * <p>Driver-loading and SQL failures are reported with a stack trace on
     * standard error; the method then returns normally.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Class.forName("oracle.jdbc.driver.OracleDriver");
            // try-with-resources closes the result set, statement and connection
            // (in that order) even when the scan fails part-way through.
            try (Connection conn = DriverManager.getConnection(JDBC_URL);
                 PreparedStatement pst = conn.prepareStatement(QUERY);
                 ResultSet rst = pst.executeQuery()) {
                // Integral counters so progress prints as "100", not "100.0".
                long tot = 0;
                long found = 0;
                while (rst.next()) {
                    int codePoint = firstCodePointAbove(rst.getString(2), MAX_PLAIN_CODE_POINT);
                    if (codePoint >= 0) {
                        found++;
                        System.out.println(rst.getInt(1) + " has a code point value of " + codePoint);
                    }
                    tot++;
                    if (tot % REPORT_INTERVAL == 0) {
                        // Cast before dividing so the ratio is not truncated to zero.
                        double percent = ((double) found / tot) * 100;
                        System.out.println("Have checked " + tot + " rows, " + percent
                                + "% of which have unicode.");
                    }
                }
                System.out.println("Checked " + tot + " rows");
            }
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Finds the first code point in {@code text} that is greater than {@code threshold}.
     *
     * @param text      the value to scan; may be {@code null} (a SQL NULL column value)
     * @param threshold the largest code point that is not reported
     * @return the first code point above {@code threshold}, or {@code -1} when
     *         {@code text} is {@code null} or contains no such code point
     */
    private static int firstCodePointAbove(String text, int threshold) {
        if (text == null) {
            return -1;
        }
        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            if (codePoint > threshold) {
                return codePoint;
            }
            // Step over both chars of a surrogate pair so it is examined once.
            i += Character.charCount(codePoint);
        }
        return -1;
    }
}